You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

811 lines
26KB

  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Understanding the Kiva Dataset\n",
  8. "\n",
  9. "Before we actually get into the work of predicting anything based on the data Kiva makes public, we first want to get a better picture of what the dataset actually looks like.\n",
  10. "\n",
  11. "Our first step: What is the schema of the data? Spark SQL will make it easy to query data in the future, but we need to know first what is available."
  12. ]
  13. },
  14. {
  15. "cell_type": "code",
  16. "execution_count": 1,
  17. "metadata": {
  18. "collapsed": true
  19. },
  20. "outputs": [],
  21. "source": [
  22. "sparkSql = (SparkSession.builder\n",
  23. " .master(\"local\")\n",
  24. " .appName(\"Kiva Exploration\")\n",
  25. " .getOrCreate())\n",
  26. "\n",
  27. "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
  28. "loans.registerTempTable('loans')\n",
  29. "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
  30. "lenders.registerTempTable('lenders')\n",
  31. "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
  32. "loans_lenders.registerTempTable('loans_lenders')"
  33. ]
  34. },
  35. {
  36. "cell_type": "code",
  37. "execution_count": 2,
  38. "metadata": {
  39. "collapsed": false
  40. },
  41. "outputs": [
  42. {
  43. "name": "stdout",
  44. "output_type": "stream",
  45. "text": [
  46. "root\n",
  47. " |-- activity: string (nullable = true)\n",
  48. " |-- basket_amount: long (nullable = true)\n",
  49. " |-- bonus_credit_eligibility: boolean (nullable = true)\n",
  50. " |-- borrowers: array (nullable = true)\n",
  51. " | |-- element: struct (containsNull = true)\n",
  52. " | | |-- first_name: string (nullable = true)\n",
  53. " | | |-- gender: string (nullable = true)\n",
  54. " | | |-- last_name: string (nullable = true)\n",
  55. " | | |-- pictured: boolean (nullable = true)\n",
  56. " |-- currency_exchange_loss_amount: double (nullable = true)\n",
  57. " |-- delinquent: boolean (nullable = true)\n",
  58. " |-- description: struct (nullable = true)\n",
  59. " | |-- languages: array (nullable = true)\n",
  60. " | | |-- element: string (containsNull = true)\n",
  61. " | |-- texts: struct (nullable = true)\n",
  62. " | | |-- ar: string (nullable = true)\n",
  63. " | | |-- en: string (nullable = true)\n",
  64. " | | |-- es: string (nullable = true)\n",
  65. " | | |-- fr: string (nullable = true)\n",
  66. " | | |-- id: string (nullable = true)\n",
  67. " | | |-- mn: string (nullable = true)\n",
  68. " | | |-- pt: string (nullable = true)\n",
  69. " | | |-- ru: string (nullable = true)\n",
  70. " | | |-- vi: string (nullable = true)\n",
  71. " |-- funded_amount: long (nullable = true)\n",
  72. " |-- funded_date: string (nullable = true)\n",
  73. " |-- id: long (nullable = true)\n",
  74. " |-- image: struct (nullable = true)\n",
  75. " | |-- id: long (nullable = true)\n",
  76. " | |-- template_id: long (nullable = true)\n",
  77. " |-- journal_totals: struct (nullable = true)\n",
  78. " | |-- bulkEntries: long (nullable = true)\n",
  79. " | |-- entries: long (nullable = true)\n",
  80. " |-- lender_count: long (nullable = true)\n",
  81. " |-- loan_amount: long (nullable = true)\n",
  82. " |-- location: struct (nullable = true)\n",
  83. " | |-- country: string (nullable = true)\n",
  84. " | |-- country_code: string (nullable = true)\n",
  85. " | |-- geo: struct (nullable = true)\n",
  86. " | | |-- level: string (nullable = true)\n",
  87. " | | |-- pairs: string (nullable = true)\n",
  88. " | | |-- type: string (nullable = true)\n",
  89. " | |-- town: string (nullable = true)\n",
  90. " |-- name: string (nullable = true)\n",
  91. " |-- paid_amount: double (nullable = true)\n",
  92. " |-- paid_date: string (nullable = true)\n",
  93. " |-- partner_id: long (nullable = true)\n",
  94. " |-- payments: array (nullable = true)\n",
  95. " | |-- element: struct (containsNull = true)\n",
  96. " | | |-- amount: double (nullable = true)\n",
  97. " | | |-- currency_exchange_loss_amount: double (nullable = true)\n",
  98. " | | |-- local_amount: double (nullable = true)\n",
  99. " | | |-- payment_id: long (nullable = true)\n",
  100. " | | |-- processed_date: string (nullable = true)\n",
  101. " | | |-- rounded_local_amount: double (nullable = true)\n",
  102. " | | |-- settlement_date: string (nullable = true)\n",
  103. " |-- planned_expiration_date: string (nullable = true)\n",
  104. " |-- posted_date: string (nullable = true)\n",
  105. " |-- sector: string (nullable = true)\n",
  106. " |-- status: string (nullable = true)\n",
  107. " |-- tags: array (nullable = true)\n",
  108. " | |-- element: struct (containsNull = true)\n",
  109. " | | |-- name: string (nullable = true)\n",
  110. " |-- terms: struct (nullable = true)\n",
  111. " | |-- disbursal_amount: double (nullable = true)\n",
  112. " | |-- disbursal_currency: string (nullable = true)\n",
  113. " | |-- disbursal_date: string (nullable = true)\n",
  114. " | |-- loan_amount: long (nullable = true)\n",
  115. " | |-- local_payments: array (nullable = true)\n",
  116. " | | |-- element: struct (containsNull = true)\n",
  117. " | | | |-- amount: double (nullable = true)\n",
  118. " | | | |-- due_date: string (nullable = true)\n",
  119. " | |-- loss_liability: struct (nullable = true)\n",
  120. " | | |-- currency_exchange: string (nullable = true)\n",
  121. " | | |-- currency_exchange_coverage_rate: double (nullable = true)\n",
  122. " | | |-- nonpayment: string (nullable = true)\n",
  123. " | |-- repayment_interval: string (nullable = true)\n",
  124. " | |-- repayment_term: long (nullable = true)\n",
  125. " | |-- scheduled_payments: array (nullable = true)\n",
  126. " | | |-- element: struct (containsNull = true)\n",
  127. " | | | |-- amount: double (nullable = true)\n",
  128. " | | | |-- due_date: string (nullable = true)\n",
  129. " |-- themes: array (nullable = true)\n",
  130. " | |-- element: string (containsNull = true)\n",
  131. " |-- translator: struct (nullable = true)\n",
  132. " | |-- byline: string (nullable = true)\n",
  133. " | |-- image: long (nullable = true)\n",
  134. " |-- use: string (nullable = true)\n",
  135. " |-- video: struct (nullable = true)\n",
  136. " | |-- id: long (nullable = true)\n",
  137. " | |-- thumbnailImageId: long (nullable = true)\n",
  138. " | |-- title: string (nullable = true)\n",
  139. " | |-- youtubeId: string (nullable = true)\n",
  140. "\n"
  141. ]
  142. }
  143. ],
  144. "source": [
  145. "loans.printSchema()"
  146. ]
  147. },
  148. {
  149. "cell_type": "code",
  150. "execution_count": 3,
  151. "metadata": {
  152. "collapsed": false
  153. },
  154. "outputs": [
  155. {
  156. "data": {
  157. "text/plain": [
  158. "[Row(status=u'refunded', count=5504),\n",
  159. " Row(status=u'defaulted', count=21776),\n",
  160. " Row(status=u'in_repayment', count=155749),\n",
  161. " Row(status=u'reviewed', count=3),\n",
  162. " Row(status=u'deleted', count=2721),\n",
  163. " Row(status=u'paid', count=775330),\n",
  164. " Row(status=u'issue', count=199),\n",
  165. " Row(status=u'inactive_expired', count=12421),\n",
  166. " Row(status=u'fundraising', count=3986),\n",
  167. " Row(status=u'expired', count=33773),\n",
  168. " Row(status=u'inactive', count=2493),\n",
  169. " Row(status=u'funded', count=173),\n",
  170. " Row(status=u'', count=2)]"
  171. ]
  172. },
  173. "execution_count": 3,
  174. "metadata": {},
  175. "output_type": "execute_result"
  176. }
  177. ],
  178. "source": [
  179. "loans.groupby(loans.status).count().collect()"
  180. ]
  181. },
  182. {
  183. "cell_type": "code",
  184. "execution_count": 4,
  185. "metadata": {
  186. "collapsed": false
  187. },
  188. "outputs": [
  189. {
  190. "data": {
  191. "text/plain": [
  192. "[Row(delinquent=None, count=970465), Row(delinquent=True, count=43665)]"
  193. ]
  194. },
  195. "execution_count": 4,
  196. "metadata": {},
  197. "output_type": "execute_result"
  198. }
  199. ],
  200. "source": [
  201. "loans.groupby(loans.delinquent).count().collect()"
  202. ]
  203. },
  204. {
  205. "cell_type": "code",
  206. "execution_count": 6,
  207. "metadata": {
  208. "collapsed": false
  209. },
  210. "outputs": [
  211. {
  212. "data": {
  213. "text/plain": [
  214. "[Row(status=u'refunded', count=156),\n",
  215. " Row(status=u'defaulted', count=20116),\n",
  216. " Row(status=u'in_repayment', count=23393)]"
  217. ]
  218. },
  219. "execution_count": 6,
  220. "metadata": {},
  221. "output_type": "execute_result"
  222. }
  223. ],
  224. "source": [
  225. "loans.where(loans.delinquent == True).groupby(loans.status).count().collect()"
  226. ]
  227. },
  228. {
  229. "cell_type": "code",
  230. "execution_count": 4,
  231. "metadata": {
  232. "collapsed": false
  233. },
  234. "outputs": [
  235. {
  236. "data": {
  237. "text/html": [
  238. "<div>\n",
  239. "<table border=\"1\" class=\"dataframe\">\n",
  240. " <thead>\n",
  241. " <tr style=\"text-align: right;\">\n",
  242. " <th></th>\n",
  243. " <th>country</th>\n",
  244. " <th>bad_loan_pct</th>\n",
  245. " </tr>\n",
  246. " </thead>\n",
  247. " <tbody>\n",
  248. " <tr>\n",
  249. " <th>0</th>\n",
  250. " <td>Chad</td>\n",
  251. " <td>93.220339</td>\n",
  252. " </tr>\n",
  253. " <tr>\n",
  254. " <th>1</th>\n",
  255. " <td>Paraguay</td>\n",
  256. " <td>2.893067</td>\n",
  257. " </tr>\n",
  258. " <tr>\n",
  259. " <th>2</th>\n",
  260. " <td>Yemen</td>\n",
  261. " <td>24.980195</td>\n",
  262. " </tr>\n",
  263. " <tr>\n",
  264. " <th>3</th>\n",
  265. " <td>Senegal</td>\n",
  266. " <td>7.014218</td>\n",
  267. " </tr>\n",
  268. " <tr>\n",
  269. " <th>4</th>\n",
  270. " <td>Philippines</td>\n",
  271. " <td>2.515182</td>\n",
  272. " </tr>\n",
  273. " <tr>\n",
  274. " <th>5</th>\n",
  275. " <td>Turkey</td>\n",
  276. " <td>0.233645</td>\n",
  277. " </tr>\n",
  278. " <tr>\n",
  279. " <th>6</th>\n",
  280. " <td>Malawi</td>\n",
  281. " <td>13.125000</td>\n",
  282. " </tr>\n",
  283. " <tr>\n",
  284. " <th>7</th>\n",
  285. " <td>Iraq</td>\n",
  286. " <td>35.476593</td>\n",
  287. " </tr>\n",
  288. " <tr>\n",
  289. " <th>8</th>\n",
  290. " <td>Cambodia</td>\n",
  291. " <td>1.433968</td>\n",
  292. " </tr>\n",
  293. " <tr>\n",
  294. " <th>9</th>\n",
  295. " <td>Afghanistan</td>\n",
  296. " <td>26.235741</td>\n",
  297. " </tr>\n",
  298. " <tr>\n",
  299. " <th>10</th>\n",
  300. " <td>Rwanda</td>\n",
  301. " <td>6.648555</td>\n",
  302. " </tr>\n",
  303. " <tr>\n",
  304. " <th>11</th>\n",
  305. " <td>Jordan</td>\n",
  306. " <td>16.066482</td>\n",
  307. " </tr>\n",
  308. " <tr>\n",
  309. " <th>12</th>\n",
  310. " <td>Kosovo</td>\n",
  311. " <td>11.893370</td>\n",
  312. " </tr>\n",
  313. " <tr>\n",
  314. " <th>13</th>\n",
  315. " <td>Sri Lanka</td>\n",
  316. " <td>25.296443</td>\n",
  317. " </tr>\n",
  318. " <tr>\n",
  319. " <th>14</th>\n",
  320. " <td>Togo</td>\n",
  321. " <td>16.013720</td>\n",
  322. " </tr>\n",
  323. " <tr>\n",
  324. " <th>15</th>\n",
  325. " <td>Ecuador</td>\n",
  326. " <td>7.329710</td>\n",
  327. " </tr>\n",
  328. " <tr>\n",
  329. " <th>16</th>\n",
  330. " <td>Lesotho</td>\n",
  331. " <td>4.081633</td>\n",
  332. " </tr>\n",
  333. " <tr>\n",
  334. " <th>17</th>\n",
  335. " <td>Albania</td>\n",
  336. " <td>9.304468</td>\n",
  337. " </tr>\n",
  338. " <tr>\n",
  339. " <th>18</th>\n",
  340. " <td>Madagascar</td>\n",
  341. " <td>3.710247</td>\n",
  342. " </tr>\n",
  343. " <tr>\n",
  344. " <th>19</th>\n",
  345. " <td>Nicaragua</td>\n",
  346. " <td>10.434992</td>\n",
  347. " </tr>\n",
  348. " <tr>\n",
  349. " <th>20</th>\n",
  350. " <td>Ghana</td>\n",
  351. " <td>9.171567</td>\n",
  352. " </tr>\n",
  353. " <tr>\n",
  354. " <th>21</th>\n",
  355. " <td>Sierra Leone</td>\n",
  356. " <td>21.054804</td>\n",
  357. " </tr>\n",
  358. " <tr>\n",
  359. " <th>22</th>\n",
  360. " <td>Peru</td>\n",
  361. " <td>4.051516</td>\n",
  362. " </tr>\n",
  363. " <tr>\n",
  364. " <th>23</th>\n",
  365. " <td>Benin</td>\n",
  366. " <td>8.137597</td>\n",
  367. " </tr>\n",
  368. " <tr>\n",
  369. " <th>24</th>\n",
  370. " <td>India</td>\n",
  371. " <td>4.391382</td>\n",
  372. " </tr>\n",
  373. " <tr>\n",
  374. " <th>25</th>\n",
  375. " <td>United States</td>\n",
  376. " <td>34.945144</td>\n",
  377. " </tr>\n",
  378. " <tr>\n",
  379. " <th>26</th>\n",
  380. " <td>China</td>\n",
  381. " <td>0.000000</td>\n",
  382. " </tr>\n",
  383. " <tr>\n",
  384. " <th>27</th>\n",
  385. " <td>Timor-Leste</td>\n",
  386. " <td>9.585492</td>\n",
  387. " </tr>\n",
  388. " <tr>\n",
  389. " <th>28</th>\n",
  390. " <td>Lao PDR</td>\n",
  391. " <td>0.000000</td>\n",
  392. " </tr>\n",
  393. " <tr>\n",
  394. " <th>29</th>\n",
  395. " <td>Somalia</td>\n",
  396. " <td>17.316017</td>\n",
  397. " </tr>\n",
  398. " <tr>\n",
  399. " <th>...</th>\n",
  400. " <td>...</td>\n",
  401. " <td>...</td>\n",
  402. " </tr>\n",
  403. " <tr>\n",
  404. " <th>62</th>\n",
  405. " <td>Kyrgyzstan</td>\n",
  406. " <td>7.112293</td>\n",
  407. " </tr>\n",
  408. " <tr>\n",
  409. " <th>63</th>\n",
  410. " <td>Samoa</td>\n",
  411. " <td>3.421662</td>\n",
  412. " </tr>\n",
  413. " <tr>\n",
  414. " <th>64</th>\n",
  415. " <td>Mozambique</td>\n",
  416. " <td>10.196484</td>\n",
  417. " </tr>\n",
  418. " <tr>\n",
  419. " <th>65</th>\n",
  420. " <td>Brazil</td>\n",
  421. " <td>42.477876</td>\n",
  422. " </tr>\n",
  423. " <tr>\n",
  424. " <th>66</th>\n",
  425. " <td>Belize</td>\n",
  426. " <td>45.026178</td>\n",
  427. " </tr>\n",
  428. " <tr>\n",
  429. " <th>67</th>\n",
  430. " <td>Kenya</td>\n",
  431. " <td>12.214316</td>\n",
  432. " </tr>\n",
  433. " <tr>\n",
  434. " <th>68</th>\n",
  435. " <td>Lebanon</td>\n",
  436. " <td>6.509946</td>\n",
  437. " </tr>\n",
  438. " <tr>\n",
  439. " <th>69</th>\n",
  440. " <td>Dominican Republic</td>\n",
  441. " <td>14.982415</td>\n",
  442. " </tr>\n",
  443. " <tr>\n",
  444. " <th>70</th>\n",
  445. " <td>Tanzania</td>\n",
  446. " <td>11.312424</td>\n",
  447. " </tr>\n",
  448. " <tr>\n",
  449. " <th>71</th>\n",
  450. " <td>Botswana</td>\n",
  451. " <td>100.000000</td>\n",
  452. " </tr>\n",
  453. " <tr>\n",
  454. " <th>72</th>\n",
  455. " <td>Bosnia and Herzegovina</td>\n",
  456. " <td>5.536913</td>\n",
  457. " </tr>\n",
  458. " <tr>\n",
  459. " <th>73</th>\n",
  460. " <td>Haiti</td>\n",
  461. " <td>25.527192</td>\n",
  462. " </tr>\n",
  463. " <tr>\n",
  464. " <th>74</th>\n",
  465. " <td>Cameroon</td>\n",
  466. " <td>4.695009</td>\n",
  467. " </tr>\n",
  468. " <tr>\n",
  469. " <th>75</th>\n",
  470. " <td>Papua New Guinea</td>\n",
  471. " <td>0.000000</td>\n",
  472. " </tr>\n",
  473. " <tr>\n",
  474. " <th>76</th>\n",
  475. " <td>Solomon Islands</td>\n",
  476. " <td>0.000000</td>\n",
  477. " </tr>\n",
  478. " <tr>\n",
  479. " <th>77</th>\n",
  480. " <td>Bulgaria</td>\n",
  481. " <td>2.333333</td>\n",
  482. " </tr>\n",
  483. " <tr>\n",
  484. " <th>78</th>\n",
  485. " <td>St Vincent</td>\n",
  486. " <td>72.727273</td>\n",
  487. " </tr>\n",
  488. " <tr>\n",
  489. " <th>79</th>\n",
  490. " <td>Nepal</td>\n",
  491. " <td>0.397953</td>\n",
  492. " </tr>\n",
  493. " <tr>\n",
  494. " <th>80</th>\n",
  495. " <td>El Salvador</td>\n",
  496. " <td>17.143167</td>\n",
  497. " </tr>\n",
  498. " <tr>\n",
  499. " <th>81</th>\n",
  500. " <td>Egypt</td>\n",
  501. " <td>0.000000</td>\n",
  502. " </tr>\n",
  503. " <tr>\n",
  504. " <th>82</th>\n",
  505. " <td>Costa Rica</td>\n",
  506. " <td>8.917357</td>\n",
  507. " </tr>\n",
  508. " <tr>\n",
  509. " <th>83</th>\n",
  510. " <td>Congo (Rep.)</td>\n",
  511. " <td>1.198257</td>\n",
  512. " </tr>\n",
  513. " <tr>\n",
  514. " <th>84</th>\n",
  515. " <td>Burkina Faso</td>\n",
  516. " <td>2.172702</td>\n",
  517. " </tr>\n",
  518. " <tr>\n",
  519. " <th>85</th>\n",
  520. " <td>South Africa</td>\n",
  521. " <td>3.364486</td>\n",
  522. " </tr>\n",
  523. " <tr>\n",
  524. " <th>86</th>\n",
  525. " <td>Colombia</td>\n",
  526. " <td>21.693634</td>\n",
  527. " </tr>\n",
  528. " <tr>\n",
  529. " <th>87</th>\n",
  530. " <td>Pakistan</td>\n",
  531. " <td>6.085563</td>\n",
  532. " </tr>\n",
  533. " <tr>\n",
  534. " <th>88</th>\n",
  535. " <td>Vanuatu</td>\n",
  536. " <td>0.000000</td>\n",
  537. " </tr>\n",
  538. " <tr>\n",
  539. " <th>89</th>\n",
  540. " <td>Moldova</td>\n",
  541. " <td>8.549223</td>\n",
  542. " </tr>\n",
  543. " <tr>\n",
  544. " <th>90</th>\n",
  545. " <td>Vietnam</td>\n",
  546. " <td>5.473769</td>\n",
  547. " </tr>\n",
  548. " <tr>\n",
  549. " <th>91</th>\n",
  550. " <td>Mali</td>\n",
  551. " <td>10.650330</td>\n",
  552. " </tr>\n",
  553. " </tbody>\n",
  554. "</table>\n",
  555. "<p>92 rows × 2 columns</p>\n",
  556. "</div>"
  557. ],
  558. "text/plain": [
  559. " country bad_loan_pct\n",
  560. "0 Chad 93.220339\n",
  561. "1 Paraguay 2.893067\n",
  562. "2 Yemen 24.980195\n",
  563. "3 Senegal 7.014218\n",
  564. "4 Philippines 2.515182\n",
  565. "5 Turkey 0.233645\n",
  566. "6 Malawi 13.125000\n",
  567. "7 Iraq 35.476593\n",
  568. "8 Cambodia 1.433968\n",
  569. "9 Afghanistan 26.235741\n",
  570. "10 Rwanda 6.648555\n",
  571. "11 Jordan 16.066482\n",
  572. "12 Kosovo 11.893370\n",
  573. "13 Sri Lanka 25.296443\n",
  574. "14 Togo 16.013720\n",
  575. "15 Ecuador 7.329710\n",
  576. "16 Lesotho 4.081633\n",
  577. "17 Albania 9.304468\n",
  578. "18 Madagascar 3.710247\n",
  579. "19 Nicaragua 10.434992\n",
  580. "20 Ghana 9.171567\n",
  581. "21 Sierra Leone 21.054804\n",
  582. "22 Peru 4.051516\n",
  583. "23 Benin 8.137597\n",
  584. "24 India 4.391382\n",
  585. "25 United States 34.945144\n",
  586. "26 China 0.000000\n",
  587. "27 Timor-Leste 9.585492\n",
  588. "28 Lao PDR 0.000000\n",
  589. "29 Somalia 17.316017\n",
  590. ".. ... ...\n",
  591. "62 Kyrgyzstan 7.112293\n",
  592. "63 Samoa 3.421662\n",
  593. "64 Mozambique 10.196484\n",
  594. "65 Brazil 42.477876\n",
  595. "66 Belize 45.026178\n",
  596. "67 Kenya 12.214316\n",
  597. "68 Lebanon 6.509946\n",
  598. "69 Dominican Republic 14.982415\n",
  599. "70 Tanzania 11.312424\n",
  600. "71 Botswana 100.000000\n",
  601. "72 Bosnia and Herzegovina 5.536913\n",
  602. "73 Haiti 25.527192\n",
  603. "74 Cameroon 4.695009\n",
  604. "75 Papua New Guinea 0.000000\n",
  605. "76 Solomon Islands 0.000000\n",
  606. "77 Bulgaria 2.333333\n",
  607. "78 St Vincent 72.727273\n",
  608. "79 Nepal 0.397953\n",
  609. "80 El Salvador 17.143167\n",
  610. "81 Egypt 0.000000\n",
  611. "82 Costa Rica 8.917357\n",
  612. "83 Congo (Rep.) 1.198257\n",
  613. "84 Burkina Faso 2.172702\n",
  614. "85 South Africa 3.364486\n",
  615. "86 Colombia 21.693634\n",
  616. "87 Pakistan 6.085563\n",
  617. "88 Vanuatu 0.000000\n",
  618. "89 Moldova 8.549223\n",
  619. "90 Vietnam 5.473769\n",
  620. "91 Mali 10.650330\n",
  621. "\n",
  622. "[92 rows x 2 columns]"
  623. ]
  624. },
  625. "execution_count": 4,
  626. "metadata": {},
  627. "output_type": "execute_result"
  628. }
  629. ],
  630. "source": [
  631. "bad_pct_df = sparkSql.sql('''\n",
  632. "SELECT\n",
  633. " sub.country,\n",
  634. " sub.bad_loans / sub.total_loans * 100.0 as bad_loan_pct\n",
  635. "FROM (SELECT\n",
  636. " SUM(CASE WHEN\n",
  637. " loans.status = 'refunded' OR\n",
  638. " loans.status = 'expired' OR\n",
  639. " loans.status = 'defaulted' OR\n",
  640. " loans.delinquent = True\n",
  641. " THEN 1\n",
  642. " ELSE 0 END) AS bad_loans,\n",
  643. " COUNT(*) AS total_loans,\n",
  644. " loans.location.country\n",
  645. " FROM loans\n",
  646. " GROUP BY\n",
  647. " loans.location.country\n",
  648. ") sub\n",
  649. "''').toPandas()\n",
  650. "\n",
  651. "bad_pct_df"
  652. ]
  653. },
  654. {
  655. "cell_type": "code",
  656. "execution_count": 8,
  657. "metadata": {
  658. "collapsed": false
  659. },
  660. "outputs": [
  661. {
  662. "data": {
  663. "text/html": [
  664. "<div>\n",
  665. "<table border=\"1\" class=\"dataframe\">\n",
  666. " <thead>\n",
  667. " <tr style=\"text-align: right;\">\n",
  668. " <th></th>\n",
  669. " <th>country</th>\n",
  670. " <th>bad_loan_pct</th>\n",
  671. " </tr>\n",
  672. " </thead>\n",
  673. " <tbody>\n",
  674. " <tr>\n",
  675. " <th>71</th>\n",
  676. " <td>Botswana</td>\n",
  677. " <td>100.000000</td>\n",
  678. " </tr>\n",
  679. " <tr>\n",
  680. " <th>47</th>\n",
  681. " <td>Gaza</td>\n",
  682. " <td>100.000000</td>\n",
  683. " </tr>\n",
  684. " <tr>\n",
  685. " <th>0</th>\n",
  686. " <td>Chad</td>\n",
  687. " <td>93.220339</td>\n",
  688. " </tr>\n",
  689. " <tr>\n",
  690. " <th>78</th>\n",
  691. " <td>St Vincent</td>\n",
  692. " <td>72.727273</td>\n",
  693. " </tr>\n",
  694. " <tr>\n",
  695. " <th>66</th>\n",
  696. " <td>Belize</td>\n",
  697. " <td>45.026178</td>\n",
  698. " </tr>\n",
  699. " </tbody>\n",
  700. "</table>\n",
  701. "</div>"
  702. ],
  703. "text/plain": [
  704. " country bad_loan_pct\n",
  705. "71 Botswana 100.000000\n",
  706. "47 Gaza 100.000000\n",
  707. "0 Chad 93.220339\n",
  708. "78 St Vincent 72.727273\n",
  709. "66 Belize 45.026178"
  710. ]
  711. },
  712. "execution_count": 8,
  713. "metadata": {},
  714. "output_type": "execute_result"
  715. }
  716. ],
  717. "source": [
  718. "bad_pct_df.sort_values(by='bad_loan_pct', ascending=False).head()"
  719. ]
  720. },
  721. {
  722. "cell_type": "code",
  723. "execution_count": 9,
  724. "metadata": {
  725. "collapsed": false
  726. },
  727. "outputs": [
  728. {
  729. "data": {
  730. "text/html": [
  731. "<div>\n",
  732. "<table border=\"1\" class=\"dataframe\">\n",
  733. " <thead>\n",
  734. " <tr style=\"text-align: right;\">\n",
  735. " <th></th>\n",
  736. " <th>country</th>\n",
  737. " <th>bad_loan_pct</th>\n",
  738. " </tr>\n",
  739. " </thead>\n",
  740. " <tbody>\n",
  741. " <tr>\n",
  742. " <th>26</th>\n",
  743. " <td>China</td>\n",
  744. " <td>0</td>\n",
  745. " </tr>\n",
  746. " <tr>\n",
  747. " <th>41</th>\n",
  748. " <td>Cape Verde</td>\n",
  749. " <td>0</td>\n",
  750. " </tr>\n",
  751. " <tr>\n",
  752. " <th>39</th>\n",
  753. " <td>Thailand</td>\n",
  754. " <td>0</td>\n",
  755. " </tr>\n",
  756. " <tr>\n",
  757. " <th>38</th>\n",
  758. " <td>Bangladesh</td>\n",
  759. " <td>0</td>\n",
  760. " </tr>\n",
  761. " <tr>\n",
  762. " <th>37</th>\n",
  763. " <td>Mauritania</td>\n",
  764. " <td>0</td>\n",
  765. " </tr>\n",
  766. " </tbody>\n",
  767. "</table>\n",
  768. "</div>"
  769. ],
  770. "text/plain": [
  771. " country bad_loan_pct\n",
  772. "26 China 0\n",
  773. "41 Cape Verde 0\n",
  774. "39 Thailand 0\n",
  775. "38 Bangladesh 0\n",
  776. "37 Mauritania 0"
  777. ]
  778. },
  779. "execution_count": 9,
  780. "metadata": {},
  781. "output_type": "execute_result"
  782. }
  783. ],
  784. "source": [
  785. "bad_pct_df.sort_values(by='bad_loan_pct').head()"
  786. ]
  787. }
  788. ],
  789. "metadata": {
  790. "kernelspec": {
  791. "display_name": "Python 2",
  792. "language": "python",
  793. "name": "python2"
  794. },
  795. "language_info": {
  796. "codemirror_mode": {
  797. "name": "ipython",
  798. "version": 2
  799. },
  800. "file_extension": ".py",
  801. "mimetype": "text/x-python",
  802. "name": "python",
  803. "nbconvert_exporter": "python",
  804. "pygments_lexer": "ipython2",
  805. "version": "2.7.12"
  806. }
  807. },
  808. "nbformat": 4,
  809. "nbformat_minor": 1
  810. }