1
0
mirror of https://github.com/bspeice/kiva-dig synced 2025-01-15 00:20:04 -05:00

Add a non-trivial SQL example

This commit is contained in:
Bradlee Speice 2016-11-02 18:59:00 -04:00
parent f32e0b37f1
commit c9ec434263
2 changed files with 1356 additions and 10 deletions

View File

@ -0,0 +1,810 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Understanding the Kiva Dataset\n",
"\n",
"Before we get into the work of predicting anything from the data Kiva makes public, we first want a better picture of what the dataset actually looks like.\n",
"\n",
"Our first step: what is the schema of the data? Spark SQL will make the data easy to query later on, but first we need to know which fields are available."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Explicit import so the cell also runs on a plain Python kernel,\n",
"# not only when Jupyter is started through the pyspark launcher.\n",
"from pyspark.sql import SparkSession\n",
"\n",
"# Start (or reuse) a local Spark session for the exploration below.\n",
"sparkSql = (SparkSession.builder\n",
"            .master(\"local\")\n",
"            .appName(\"Kiva Exploration\")\n",
"            .getOrCreate())\n",
"\n",
"# Load each Kiva JSON dump and register it as a temp view of the same name\n",
"# so the later cells can query it through sparkSql.sql(...).\n",
"# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.\n",
"loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
"loans.createOrReplaceTempView('loans')\n",
"lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
"lenders.createOrReplaceTempView('lenders')\n",
"loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
"loans_lenders.createOrReplaceTempView('loans_lenders')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- activity: string (nullable = true)\n",
" |-- basket_amount: long (nullable = true)\n",
" |-- bonus_credit_eligibility: boolean (nullable = true)\n",
" |-- borrowers: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- first_name: string (nullable = true)\n",
" | | |-- gender: string (nullable = true)\n",
" | | |-- last_name: string (nullable = true)\n",
" | | |-- pictured: boolean (nullable = true)\n",
" |-- currency_exchange_loss_amount: double (nullable = true)\n",
" |-- delinquent: boolean (nullable = true)\n",
" |-- description: struct (nullable = true)\n",
" | |-- languages: array (nullable = true)\n",
" | | |-- element: string (containsNull = true)\n",
" | |-- texts: struct (nullable = true)\n",
" | | |-- ar: string (nullable = true)\n",
" | | |-- en: string (nullable = true)\n",
" | | |-- es: string (nullable = true)\n",
" | | |-- fr: string (nullable = true)\n",
" | | |-- id: string (nullable = true)\n",
" | | |-- mn: string (nullable = true)\n",
" | | |-- pt: string (nullable = true)\n",
" | | |-- ru: string (nullable = true)\n",
" | | |-- vi: string (nullable = true)\n",
" |-- funded_amount: long (nullable = true)\n",
" |-- funded_date: string (nullable = true)\n",
" |-- id: long (nullable = true)\n",
" |-- image: struct (nullable = true)\n",
" | |-- id: long (nullable = true)\n",
" | |-- template_id: long (nullable = true)\n",
" |-- journal_totals: struct (nullable = true)\n",
" | |-- bulkEntries: long (nullable = true)\n",
" | |-- entries: long (nullable = true)\n",
" |-- lender_count: long (nullable = true)\n",
" |-- loan_amount: long (nullable = true)\n",
" |-- location: struct (nullable = true)\n",
" | |-- country: string (nullable = true)\n",
" | |-- country_code: string (nullable = true)\n",
" | |-- geo: struct (nullable = true)\n",
" | | |-- level: string (nullable = true)\n",
" | | |-- pairs: string (nullable = true)\n",
" | | |-- type: string (nullable = true)\n",
" | |-- town: string (nullable = true)\n",
" |-- name: string (nullable = true)\n",
" |-- paid_amount: double (nullable = true)\n",
" |-- paid_date: string (nullable = true)\n",
" |-- partner_id: long (nullable = true)\n",
" |-- payments: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- amount: double (nullable = true)\n",
" | | |-- currency_exchange_loss_amount: double (nullable = true)\n",
" | | |-- local_amount: double (nullable = true)\n",
" | | |-- payment_id: long (nullable = true)\n",
" | | |-- processed_date: string (nullable = true)\n",
" | | |-- rounded_local_amount: double (nullable = true)\n",
" | | |-- settlement_date: string (nullable = true)\n",
" |-- planned_expiration_date: string (nullable = true)\n",
" |-- posted_date: string (nullable = true)\n",
" |-- sector: string (nullable = true)\n",
" |-- status: string (nullable = true)\n",
" |-- tags: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- name: string (nullable = true)\n",
" |-- terms: struct (nullable = true)\n",
" | |-- disbursal_amount: double (nullable = true)\n",
" | |-- disbursal_currency: string (nullable = true)\n",
" | |-- disbursal_date: string (nullable = true)\n",
" | |-- loan_amount: long (nullable = true)\n",
" | |-- local_payments: array (nullable = true)\n",
" | | |-- element: struct (containsNull = true)\n",
" | | | |-- amount: double (nullable = true)\n",
" | | | |-- due_date: string (nullable = true)\n",
" | |-- loss_liability: struct (nullable = true)\n",
" | | |-- currency_exchange: string (nullable = true)\n",
" | | |-- currency_exchange_coverage_rate: double (nullable = true)\n",
" | | |-- nonpayment: string (nullable = true)\n",
" | |-- repayment_interval: string (nullable = true)\n",
" | |-- repayment_term: long (nullable = true)\n",
" | |-- scheduled_payments: array (nullable = true)\n",
" | | |-- element: struct (containsNull = true)\n",
" | | | |-- amount: double (nullable = true)\n",
" | | | |-- due_date: string (nullable = true)\n",
" |-- themes: array (nullable = true)\n",
" | |-- element: string (containsNull = true)\n",
" |-- translator: struct (nullable = true)\n",
" | |-- byline: string (nullable = true)\n",
" | |-- image: long (nullable = true)\n",
" |-- use: string (nullable = true)\n",
" |-- video: struct (nullable = true)\n",
" | |-- id: long (nullable = true)\n",
" | |-- thumbnailImageId: long (nullable = true)\n",
" | |-- title: string (nullable = true)\n",
" | |-- youtubeId: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"loans.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(status=u'refunded', count=5504),\n",
" Row(status=u'defaulted', count=21776),\n",
" Row(status=u'in_repayment', count=155749),\n",
" Row(status=u'reviewed', count=3),\n",
" Row(status=u'deleted', count=2721),\n",
" Row(status=u'paid', count=775330),\n",
" Row(status=u'issue', count=199),\n",
" Row(status=u'inactive_expired', count=12421),\n",
" Row(status=u'fundraising', count=3986),\n",
" Row(status=u'expired', count=33773),\n",
" Row(status=u'inactive', count=2493),\n",
" Row(status=u'funded', count=173),\n",
" Row(status=u'', count=2)]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loans.groupby(loans.status).count().collect()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(delinquent=None, count=970465), Row(delinquent=True, count=43665)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loans.groupby(loans.delinquent).count().collect()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(status=u'refunded', count=156),\n",
" Row(status=u'defaulted', count=20116),\n",
" Row(status=u'in_repayment', count=23393)]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loans.where(loans.delinquent == True).groupby(loans.status).count().collect()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country</th>\n",
" <th>bad_loan_pct</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Chad</td>\n",
" <td>93.220339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Paraguay</td>\n",
" <td>2.893067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Yemen</td>\n",
" <td>24.980195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Senegal</td>\n",
" <td>7.014218</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Philippines</td>\n",
" <td>2.515182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Turkey</td>\n",
" <td>0.233645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Malawi</td>\n",
" <td>13.125000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Iraq</td>\n",
" <td>35.476593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Cambodia</td>\n",
" <td>1.433968</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Afghanistan</td>\n",
" <td>26.235741</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Rwanda</td>\n",
" <td>6.648555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Jordan</td>\n",
" <td>16.066482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Kosovo</td>\n",
" <td>11.893370</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Sri Lanka</td>\n",
" <td>25.296443</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Togo</td>\n",
" <td>16.013720</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Ecuador</td>\n",
" <td>7.329710</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Lesotho</td>\n",
" <td>4.081633</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Albania</td>\n",
" <td>9.304468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Madagascar</td>\n",
" <td>3.710247</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Nicaragua</td>\n",
" <td>10.434992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Ghana</td>\n",
" <td>9.171567</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Sierra Leone</td>\n",
" <td>21.054804</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>Peru</td>\n",
" <td>4.051516</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Benin</td>\n",
" <td>8.137597</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>India</td>\n",
" <td>4.391382</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>United States</td>\n",
" <td>34.945144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>China</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Timor-Leste</td>\n",
" <td>9.585492</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>Lao PDR</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>Somalia</td>\n",
" <td>17.316017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>Kyrgyzstan</td>\n",
" <td>7.112293</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>Samoa</td>\n",
" <td>3.421662</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>Mozambique</td>\n",
" <td>10.196484</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>Brazil</td>\n",
" <td>42.477876</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>Belize</td>\n",
" <td>45.026178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>Kenya</td>\n",
" <td>12.214316</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>Lebanon</td>\n",
" <td>6.509946</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>Dominican Republic</td>\n",
" <td>14.982415</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>Tanzania</td>\n",
" <td>11.312424</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>Botswana</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>Bosnia and Herzegovina</td>\n",
" <td>5.536913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>Haiti</td>\n",
" <td>25.527192</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>Cameroon</td>\n",
" <td>4.695009</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>Papua New Guinea</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>Solomon Islands</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>Bulgaria</td>\n",
" <td>2.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>St Vincent</td>\n",
" <td>72.727273</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>Nepal</td>\n",
" <td>0.397953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80</th>\n",
" <td>El Salvador</td>\n",
" <td>17.143167</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81</th>\n",
" <td>Egypt</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82</th>\n",
" <td>Costa Rica</td>\n",
" <td>8.917357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
" <td>Congo (Rep.)</td>\n",
" <td>1.198257</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>Burkina Faso</td>\n",
" <td>2.172702</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>South Africa</td>\n",
" <td>3.364486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>Colombia</td>\n",
" <td>21.693634</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>Pakistan</td>\n",
" <td>6.085563</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>Vanuatu</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>Moldova</td>\n",
" <td>8.549223</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>Vietnam</td>\n",
" <td>5.473769</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>Mali</td>\n",
" <td>10.650330</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>92 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" country bad_loan_pct\n",
"0 Chad 93.220339\n",
"1 Paraguay 2.893067\n",
"2 Yemen 24.980195\n",
"3 Senegal 7.014218\n",
"4 Philippines 2.515182\n",
"5 Turkey 0.233645\n",
"6 Malawi 13.125000\n",
"7 Iraq 35.476593\n",
"8 Cambodia 1.433968\n",
"9 Afghanistan 26.235741\n",
"10 Rwanda 6.648555\n",
"11 Jordan 16.066482\n",
"12 Kosovo 11.893370\n",
"13 Sri Lanka 25.296443\n",
"14 Togo 16.013720\n",
"15 Ecuador 7.329710\n",
"16 Lesotho 4.081633\n",
"17 Albania 9.304468\n",
"18 Madagascar 3.710247\n",
"19 Nicaragua 10.434992\n",
"20 Ghana 9.171567\n",
"21 Sierra Leone 21.054804\n",
"22 Peru 4.051516\n",
"23 Benin 8.137597\n",
"24 India 4.391382\n",
"25 United States 34.945144\n",
"26 China 0.000000\n",
"27 Timor-Leste 9.585492\n",
"28 Lao PDR 0.000000\n",
"29 Somalia 17.316017\n",
".. ... ...\n",
"62 Kyrgyzstan 7.112293\n",
"63 Samoa 3.421662\n",
"64 Mozambique 10.196484\n",
"65 Brazil 42.477876\n",
"66 Belize 45.026178\n",
"67 Kenya 12.214316\n",
"68 Lebanon 6.509946\n",
"69 Dominican Republic 14.982415\n",
"70 Tanzania 11.312424\n",
"71 Botswana 100.000000\n",
"72 Bosnia and Herzegovina 5.536913\n",
"73 Haiti 25.527192\n",
"74 Cameroon 4.695009\n",
"75 Papua New Guinea 0.000000\n",
"76 Solomon Islands 0.000000\n",
"77 Bulgaria 2.333333\n",
"78 St Vincent 72.727273\n",
"79 Nepal 0.397953\n",
"80 El Salvador 17.143167\n",
"81 Egypt 0.000000\n",
"82 Costa Rica 8.917357\n",
"83 Congo (Rep.) 1.198257\n",
"84 Burkina Faso 2.172702\n",
"85 South Africa 3.364486\n",
"86 Colombia 21.693634\n",
"87 Pakistan 6.085563\n",
"88 Vanuatu 0.000000\n",
"89 Moldova 8.549223\n",
"90 Vietnam 5.473769\n",
"91 Mali 10.650330\n",
"\n",
"[92 rows x 2 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-country percentage of loans counted as bad: status refunded, expired\n",
"# or defaulted, or the delinquent flag set. The inner query counts bad and\n",
"# total loans per country; the outer query turns the counts into a percentage.\n",
"bad_loan_query = '''\n",
"SELECT\n",
" sub.country,\n",
" sub.bad_loans / sub.total_loans * 100.0 as bad_loan_pct\n",
"FROM (SELECT\n",
" SUM(CASE WHEN\n",
" loans.status = 'refunded' OR\n",
" loans.status = 'expired' OR\n",
" loans.status = 'defaulted' OR\n",
" loans.delinquent = True\n",
" THEN 1\n",
" ELSE 0 END) AS bad_loans,\n",
" COUNT(*) AS total_loans,\n",
" loans.location.country\n",
" FROM loans\n",
" GROUP BY\n",
" loans.location.country\n",
") sub\n",
"'''\n",
"\n",
"# Run the aggregation in Spark, then bring the per-country result into pandas.\n",
"bad_pct_df = sparkSql.sql(bad_loan_query).toPandas()\n",
"bad_pct_df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country</th>\n",
" <th>bad_loan_pct</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>Botswana</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>Gaza</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Chad</td>\n",
" <td>93.220339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>St Vincent</td>\n",
" <td>72.727273</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>Belize</td>\n",
" <td>45.026178</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" country bad_loan_pct\n",
"71 Botswana 100.000000\n",
"47 Gaza 100.000000\n",
"0 Chad 93.220339\n",
"78 St Vincent 72.727273\n",
"66 Belize 45.026178"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bad_pct_df.sort_values(by='bad_loan_pct', ascending=False).head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country</th>\n",
" <th>bad_loan_pct</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>China</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>Cape Verde</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>Thailand</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>Bangladesh</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>Mauritania</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" country bad_loan_pct\n",
"26 China 0\n",
"41 Cape Verde 0\n",
"39 Thailand 0\n",
"38 Bangladesh 0\n",
"37 Mauritania 0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bad_pct_df.sort_values(by='bad_loan_pct').head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -25,8 +25,11 @@
" .getOrCreate())\n",
"\n",
"loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
"loans.registerTempTable('loans')\n",
"lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
"loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')"
"lenders.registerTempTable('lenders')\n",
"loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
"loans_lenders.registerTempTable('loans_lenders')"
]
},
{
@ -224,29 +227,562 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country</th>\n",
" <th>bad_loan_pct</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Chad</td>\n",
" <td>93.220339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Paraguay</td>\n",
" <td>2.893067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Yemen</td>\n",
" <td>24.980195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Senegal</td>\n",
" <td>7.014218</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Philippines</td>\n",
" <td>2.515182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Turkey</td>\n",
" <td>0.233645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Malawi</td>\n",
" <td>13.125000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Iraq</td>\n",
" <td>35.476593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Cambodia</td>\n",
" <td>1.433968</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Afghanistan</td>\n",
" <td>26.235741</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Rwanda</td>\n",
" <td>6.648555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Jordan</td>\n",
" <td>16.066482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Kosovo</td>\n",
" <td>11.893370</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Sri Lanka</td>\n",
" <td>25.296443</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Togo</td>\n",
" <td>16.013720</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Ecuador</td>\n",
" <td>7.329710</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Lesotho</td>\n",
" <td>4.081633</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Albania</td>\n",
" <td>9.304468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Madagascar</td>\n",
" <td>3.710247</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Nicaragua</td>\n",
" <td>10.434992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Ghana</td>\n",
" <td>9.171567</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Sierra Leone</td>\n",
" <td>21.054804</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>Peru</td>\n",
" <td>4.051516</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Benin</td>\n",
" <td>8.137597</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>India</td>\n",
" <td>4.391382</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>United States</td>\n",
" <td>34.945144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>China</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Timor-Leste</td>\n",
" <td>9.585492</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>Lao PDR</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>Somalia</td>\n",
" <td>17.316017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>Kyrgyzstan</td>\n",
" <td>7.112293</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>Samoa</td>\n",
" <td>3.421662</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>Mozambique</td>\n",
" <td>10.196484</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>Brazil</td>\n",
" <td>42.477876</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>Belize</td>\n",
" <td>45.026178</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>Kenya</td>\n",
" <td>12.214316</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>Lebanon</td>\n",
" <td>6.509946</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>Dominican Republic</td>\n",
" <td>14.982415</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>Tanzania</td>\n",
" <td>11.312424</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>Botswana</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>Bosnia and Herzegovina</td>\n",
" <td>5.536913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>Haiti</td>\n",
" <td>25.527192</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>Cameroon</td>\n",
" <td>4.695009</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>Papua New Guinea</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>Solomon Islands</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>Bulgaria</td>\n",
" <td>2.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>St Vincent</td>\n",
" <td>72.727273</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>Nepal</td>\n",
" <td>0.397953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80</th>\n",
" <td>El Salvador</td>\n",
" <td>17.143167</td>\n",
" </tr>\n",
" <tr>\n",
" <th>81</th>\n",
" <td>Egypt</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>82</th>\n",
" <td>Costa Rica</td>\n",
" <td>8.917357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
" <td>Congo (Rep.)</td>\n",
" <td>1.198257</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>Burkina Faso</td>\n",
" <td>2.172702</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>South Africa</td>\n",
" <td>3.364486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>Colombia</td>\n",
" <td>21.693634</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>Pakistan</td>\n",
" <td>6.085563</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>Vanuatu</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>Moldova</td>\n",
" <td>8.549223</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>Vietnam</td>\n",
" <td>5.473769</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>Mali</td>\n",
" <td>10.650330</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>92 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
"[Row(status=u'in_repayment')]"
" country bad_loan_pct\n",
"0 Chad 93.220339\n",
"1 Paraguay 2.893067\n",
"2 Yemen 24.980195\n",
"3 Senegal 7.014218\n",
"4 Philippines 2.515182\n",
"5 Turkey 0.233645\n",
"6 Malawi 13.125000\n",
"7 Iraq 35.476593\n",
"8 Cambodia 1.433968\n",
"9 Afghanistan 26.235741\n",
"10 Rwanda 6.648555\n",
"11 Jordan 16.066482\n",
"12 Kosovo 11.893370\n",
"13 Sri Lanka 25.296443\n",
"14 Togo 16.013720\n",
"15 Ecuador 7.329710\n",
"16 Lesotho 4.081633\n",
"17 Albania 9.304468\n",
"18 Madagascar 3.710247\n",
"19 Nicaragua 10.434992\n",
"20 Ghana 9.171567\n",
"21 Sierra Leone 21.054804\n",
"22 Peru 4.051516\n",
"23 Benin 8.137597\n",
"24 India 4.391382\n",
"25 United States 34.945144\n",
"26 China 0.000000\n",
"27 Timor-Leste 9.585492\n",
"28 Lao PDR 0.000000\n",
"29 Somalia 17.316017\n",
".. ... ...\n",
"62 Kyrgyzstan 7.112293\n",
"63 Samoa 3.421662\n",
"64 Mozambique 10.196484\n",
"65 Brazil 42.477876\n",
"66 Belize 45.026178\n",
"67 Kenya 12.214316\n",
"68 Lebanon 6.509946\n",
"69 Dominican Republic 14.982415\n",
"70 Tanzania 11.312424\n",
"71 Botswana 100.000000\n",
"72 Bosnia and Herzegovina 5.536913\n",
"73 Haiti 25.527192\n",
"74 Cameroon 4.695009\n",
"75 Papua New Guinea 0.000000\n",
"76 Solomon Islands 0.000000\n",
"77 Bulgaria 2.333333\n",
"78 St Vincent 72.727273\n",
"79 Nepal 0.397953\n",
"80 El Salvador 17.143167\n",
"81 Egypt 0.000000\n",
"82 Costa Rica 8.917357\n",
"83 Congo (Rep.) 1.198257\n",
"84 Burkina Faso 2.172702\n",
"85 South Africa 3.364486\n",
"86 Colombia 21.693634\n",
"87 Pakistan 6.085563\n",
"88 Vanuatu 0.000000\n",
"89 Moldova 8.549223\n",
"90 Vietnam 5.473769\n",
"91 Mali 10.650330\n",
"\n",
"[92 rows x 2 columns]"
]
},
"execution_count": 19,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loans.registerTempTable('loans')\n",
"sparkSql.sql('''\n",
"SELECT loans.status\n",
"FROM loans\n",
"LIMIT 1\n",
"''').collect()"
"bad_pct_df = sparkSql.sql('''\n",
"SELECT\n",
" sub.country,\n",
" sub.bad_loans / sub.total_loans * 100.0 as bad_loan_pct\n",
"FROM (SELECT\n",
" SUM(CASE WHEN\n",
" loans.status = 'refunded' OR\n",
" loans.status = 'expired' OR\n",
" loans.status = 'defaulted' OR\n",
" loans.delinquent = True\n",
" THEN 1\n",
" ELSE 0 END) AS bad_loans,\n",
" COUNT(*) AS total_loans,\n",
" loans.location.country\n",
" FROM loans\n",
" GROUP BY\n",
" loans.location.country\n",
") sub\n",
"''').toPandas()\n",
"\n",
"bad_pct_df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country</th>\n",
" <th>bad_loan_pct</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>Botswana</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>Gaza</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Chad</td>\n",
" <td>93.220339</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>St Vincent</td>\n",
" <td>72.727273</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>Belize</td>\n",
" <td>45.026178</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" country bad_loan_pct\n",
"71 Botswana 100.000000\n",
"47 Gaza 100.000000\n",
"0 Chad 93.220339\n",
"78 St Vincent 72.727273\n",
"66 Belize 45.026178"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bad_pct_df.sort_values(by='bad_loan_pct', ascending=False).head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country</th>\n",
" <th>bad_loan_pct</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>China</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>Cape Verde</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>Thailand</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>Bangladesh</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>Mauritania</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" country bad_loan_pct\n",
"26 China 0\n",
"41 Cape Verde 0\n",
"39 Thailand 0\n",
"38 Bangladesh 0\n",
"37 Mauritania 0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bad_pct_df.sort_values(by='bad_loan_pct').head()"
]
}
],