diff --git a/.ipynb_checkpoints/Kiva Exploration-checkpoint.ipynb b/.ipynb_checkpoints/Kiva Exploration-checkpoint.ipynb new file mode 100644 index 0000000..d4ed9b4 --- /dev/null +++ b/.ipynb_checkpoints/Kiva Exploration-checkpoint.ipynb @@ -0,0 +1,810 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Understanding the Kiva Dataset\n", + "\n", + "Before we actually get into the work of predicting anything based on the data Kiva makes public, we first want to get a better picture of what the dataset actually looks like.\n", + "\n", + "Our first step: What is the schema of the data? Spark SQL will make it easy to query data in the future, but we need to know first what is available." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sparkSql = (SparkSession.builder\n", + " .master(\"local\")\n", + " .appName(\"Kiva Exploration\")\n", + " .getOrCreate())\n", + "\n", + "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n", + "loans.registerTempTable('loans')\n", + "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n", + "lenders.registerTempTable('lenders')\n", + "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n", + "loans_lenders.registerTempTable('loans_lenders')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- activity: string (nullable = true)\n", + " |-- basket_amount: long (nullable = true)\n", + " |-- bonus_credit_eligibility: boolean (nullable = true)\n", + " |-- borrowers: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- first_name: string (nullable = true)\n", + " | | |-- gender: string (nullable = true)\n", + " | | |-- last_name: string (nullable = true)\n", + " | | |-- pictured: boolean (nullable = true)\n", + " |-- currency_exchange_loss_amount: double (nullable = true)\n", + " |-- delinquent: boolean (nullable = true)\n", + " |-- description: struct (nullable = true)\n", + " | |-- languages: array (nullable = true)\n", + " | | |-- element: string (containsNull = true)\n", + " | |-- texts: struct (nullable = true)\n", + " | | |-- ar: string (nullable = true)\n", + " | | |-- en: string (nullable = true)\n", + " | | |-- es: string (nullable = true)\n", + " | | |-- fr: string (nullable = true)\n", + " | | |-- id: string (nullable = true)\n", + " | | |-- mn: string (nullable = true)\n", + " | | |-- pt: string (nullable = true)\n", + " | | |-- ru: string (nullable = true)\n", + " | | |-- vi: string (nullable = true)\n", + " |-- funded_amount: long (nullable = true)\n", + " |-- funded_date: string (nullable = true)\n", + " |-- id: long (nullable = true)\n", + " |-- image: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- template_id: long (nullable = true)\n", + " |-- journal_totals: struct (nullable = true)\n", + " | |-- bulkEntries: long (nullable = true)\n", + " | |-- entries: long (nullable = true)\n", + " |-- lender_count: long (nullable = true)\n", + " |-- loan_amount: long (nullable = true)\n", + " |-- location: struct (nullable = true)\n", + " | |-- country: string (nullable = true)\n", + " | |-- country_code: string (nullable = true)\n", + " | |-- geo: struct (nullable = true)\n", + " | | |-- level: string (nullable = true)\n", + " | | |-- pairs: string (nullable = true)\n", + " | | |-- type: string (nullable = true)\n", + " | |-- town: string (nullable = true)\n", + " |-- name: string (nullable = true)\n", + " |-- paid_amount: double (nullable = true)\n", + " |-- paid_date: string (nullable = true)\n", + " |-- partner_id: long (nullable = true)\n", + " |-- payments: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- amount: double (nullable = true)\n", + " | | |-- currency_exchange_loss_amount: double (nullable = true)\n", + " | | |-- local_amount: double (nullable = true)\n", + " | | |-- payment_id: long (nullable = true)\n", + " | | |-- processed_date: string (nullable = true)\n", + " | | |-- rounded_local_amount: double (nullable = true)\n", + " | | |-- settlement_date: string (nullable = true)\n", + " |-- planned_expiration_date: string (nullable = true)\n", + " |-- posted_date: string (nullable = true)\n", + " |-- sector: string (nullable = true)\n", + " |-- status: string (nullable = true)\n", + " |-- tags: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- name: string (nullable = true)\n", + " |-- terms: struct (nullable = true)\n", + " | |-- disbursal_amount: double (nullable = true)\n", + " | |-- disbursal_currency: string (nullable = true)\n", + " | |-- disbursal_date: string (nullable = true)\n", + " | |-- loan_amount: long (nullable = true)\n", + " | |-- local_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " | |-- loss_liability: struct (nullable = true)\n", + " | | |-- currency_exchange: string (nullable = true)\n", + " | | |-- currency_exchange_coverage_rate: double (nullable = true)\n", + " | | |-- nonpayment: string (nullable = true)\n", + " | |-- repayment_interval: string (nullable = true)\n", + " | |-- repayment_term: long (nullable = true)\n", + " | |-- scheduled_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " |-- themes: array (nullable = true)\n", + " | |-- element: string (containsNull = true)\n", + " |-- translator: struct (nullable = true)\n", + " | |-- byline: string (nullable = true)\n", + " | |-- image: long (nullable = true)\n", + " |-- use: string (nullable = true)\n", + " |-- video: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- thumbnailImageId: long (nullable = true)\n", + " | |-- title: string (nullable = true)\n", + " | |-- youtubeId: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "loans.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(status=u'refunded', count=5504),\n", + " Row(status=u'defaulted', count=21776),\n", + " Row(status=u'in_repayment', count=155749),\n", + " Row(status=u'reviewed', count=3),\n", + " Row(status=u'deleted', count=2721),\n", + " Row(status=u'paid', count=775330),\n", + " Row(status=u'issue', count=199),\n", + " Row(status=u'inactive_expired', count=12421),\n", + " Row(status=u'fundraising', count=3986),\n", + " Row(status=u'expired', count=33773),\n", + " Row(status=u'inactive', count=2493),\n", + " Row(status=u'funded', count=173),\n", + " Row(status=u'', count=2)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.groupby(loans.status).count().collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(delinquent=None, count=970465), Row(delinquent=True, count=43665)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.groupby(loans.delinquent).count().collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(status=u'refunded', count=156),\n", + " Row(status=u'defaulted', count=20116),\n", + " Row(status=u'in_repayment', count=23393)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.where(loans.delinquent == True).groupby(loans.status).count().collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrybad_loan_pct
0Chad93.220339
1Paraguay2.893067
2Yemen24.980195
3Senegal7.014218
4Philippines2.515182
5Turkey0.233645
6Malawi13.125000
7Iraq35.476593
8Cambodia1.433968
9Afghanistan26.235741
10Rwanda6.648555
11Jordan16.066482
12Kosovo11.893370
13Sri Lanka25.296443
14Togo16.013720
15Ecuador7.329710
16Lesotho4.081633
17Albania9.304468
18Madagascar3.710247
19Nicaragua10.434992
20Ghana9.171567
21Sierra Leone21.054804
22Peru4.051516
23Benin8.137597
24India4.391382
25United States34.945144
26China0.000000
27Timor-Leste9.585492
28Lao PDR0.000000
29Somalia17.316017
.........
62Kyrgyzstan7.112293
63Samoa3.421662
64Mozambique10.196484
65Brazil42.477876
66Belize45.026178
67Kenya12.214316
68Lebanon6.509946
69Dominican Republic14.982415
70Tanzania11.312424
71Botswana100.000000
72Bosnia and Herzegovina5.536913
73Haiti25.527192
74Cameroon4.695009
75Papua New Guinea0.000000
76Solomon Islands0.000000
77Bulgaria2.333333
78St Vincent72.727273
79Nepal0.397953
80El Salvador17.143167
81Egypt0.000000
82Costa Rica8.917357
83Congo (Rep.)1.198257
84Burkina Faso2.172702
85South Africa3.364486
86Colombia21.693634
87Pakistan6.085563
88Vanuatu0.000000
89Moldova8.549223
90Vietnam5.473769
91Mali10.650330
\n", + "

92 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " country bad_loan_pct\n", + "0 Chad 93.220339\n", + "1 Paraguay 2.893067\n", + "2 Yemen 24.980195\n", + "3 Senegal 7.014218\n", + "4 Philippines 2.515182\n", + "5 Turkey 0.233645\n", + "6 Malawi 13.125000\n", + "7 Iraq 35.476593\n", + "8 Cambodia 1.433968\n", + "9 Afghanistan 26.235741\n", + "10 Rwanda 6.648555\n", + "11 Jordan 16.066482\n", + "12 Kosovo 11.893370\n", + "13 Sri Lanka 25.296443\n", + "14 Togo 16.013720\n", + "15 Ecuador 7.329710\n", + "16 Lesotho 4.081633\n", + "17 Albania 9.304468\n", + "18 Madagascar 3.710247\n", + "19 Nicaragua 10.434992\n", + "20 Ghana 9.171567\n", + "21 Sierra Leone 21.054804\n", + "22 Peru 4.051516\n", + "23 Benin 8.137597\n", + "24 India 4.391382\n", + "25 United States 34.945144\n", + "26 China 0.000000\n", + "27 Timor-Leste 9.585492\n", + "28 Lao PDR 0.000000\n", + "29 Somalia 17.316017\n", + ".. ... ...\n", + "62 Kyrgyzstan 7.112293\n", + "63 Samoa 3.421662\n", + "64 Mozambique 10.196484\n", + "65 Brazil 42.477876\n", + "66 Belize 45.026178\n", + "67 Kenya 12.214316\n", + "68 Lebanon 6.509946\n", + "69 Dominican Republic 14.982415\n", + "70 Tanzania 11.312424\n", + "71 Botswana 100.000000\n", + "72 Bosnia and Herzegovina 5.536913\n", + "73 Haiti 25.527192\n", + "74 Cameroon 4.695009\n", + "75 Papua New Guinea 0.000000\n", + "76 Solomon Islands 0.000000\n", + "77 Bulgaria 2.333333\n", + "78 St Vincent 72.727273\n", + "79 Nepal 0.397953\n", + "80 El Salvador 17.143167\n", + "81 Egypt 0.000000\n", + "82 Costa Rica 8.917357\n", + "83 Congo (Rep.) 1.198257\n", + "84 Burkina Faso 2.172702\n", + "85 South Africa 3.364486\n", + "86 Colombia 21.693634\n", + "87 Pakistan 6.085563\n", + "88 Vanuatu 0.000000\n", + "89 Moldova 8.549223\n", + "90 Vietnam 5.473769\n", + "91 Mali 10.650330\n", + "\n", + "[92 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bad_pct_df = sparkSql.sql('''\n", + "SELECT\n", + " sub.country,\n", + " sub.bad_loans / sub.total_loans * 100.0 as bad_loan_pct\n", + "FROM (SELECT\n", + " SUM(CASE WHEN\n", + " loans.status = 'refunded' OR\n", + " loans.status = 'expired' OR\n", + " loans.status = 'defaulted' OR\n", + " loans.delinquent = True\n", + " THEN 1\n", + " ELSE 0 END) AS bad_loans,\n", + " COUNT(*) AS total_loans,\n", + " loans.location.country\n", + " FROM loans\n", + " GROUP BY\n", + " loans.location.country\n", + ") sub\n", + "''').toPandas()\n", + "\n", + "bad_pct_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrybad_loan_pct
71Botswana100.000000
47Gaza100.000000
0Chad93.220339
78St Vincent72.727273
66Belize45.026178
\n", + "
" + ], + "text/plain": [ + " country bad_loan_pct\n", + "71 Botswana 100.000000\n", + "47 Gaza 100.000000\n", + "0 Chad 93.220339\n", + "78 St Vincent 72.727273\n", + "66 Belize 45.026178" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bad_pct_df.sort_values(by='bad_loan_pct', ascending=False).head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrybad_loan_pct
26China0
41Cape Verde0
39Thailand0
38Bangladesh0
37Mauritania0
\n", + "
" + ], + "text/plain": [ + " country bad_loan_pct\n", + "26 China 0\n", + "41 Cape Verde 0\n", + "39 Thailand 0\n", + "38 Bangladesh 0\n", + "37 Mauritania 0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bad_pct_df.sort_values(by='bad_loan_pct').head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/Kiva Exploration.ipynb b/Kiva Exploration.ipynb index 7013b7f..d4ed9b4 100644 --- a/Kiva Exploration.ipynb +++ b/Kiva Exploration.ipynb @@ -25,8 +25,11 @@ " .getOrCreate())\n", "\n", "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n", + "loans.registerTempTable('loans')\n", "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n", - "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')" + "lenders.registerTempTable('lenders')\n", + "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n", + "loans_lenders.registerTempTable('loans_lenders')" ] }, { @@ -224,29 +227,562 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrybad_loan_pct
0Chad93.220339
1Paraguay2.893067
2Yemen24.980195
3Senegal7.014218
4Philippines2.515182
5Turkey0.233645
6Malawi13.125000
7Iraq35.476593
8Cambodia1.433968
9Afghanistan26.235741
10Rwanda6.648555
11Jordan16.066482
12Kosovo11.893370
13Sri Lanka25.296443
14Togo16.013720
15Ecuador7.329710
16Lesotho4.081633
17Albania9.304468
18Madagascar3.710247
19Nicaragua10.434992
20Ghana9.171567
21Sierra Leone21.054804
22Peru4.051516
23Benin8.137597
24India4.391382
25United States34.945144
26China0.000000
27Timor-Leste9.585492
28Lao PDR0.000000
29Somalia17.316017
.........
62Kyrgyzstan7.112293
63Samoa3.421662
64Mozambique10.196484
65Brazil42.477876
66Belize45.026178
67Kenya12.214316
68Lebanon6.509946
69Dominican Republic14.982415
70Tanzania11.312424
71Botswana100.000000
72Bosnia and Herzegovina5.536913
73Haiti25.527192
74Cameroon4.695009
75Papua New Guinea0.000000
76Solomon Islands0.000000
77Bulgaria2.333333
78St Vincent72.727273
79Nepal0.397953
80El Salvador17.143167
81Egypt0.000000
82Costa Rica8.917357
83Congo (Rep.)1.198257
84Burkina Faso2.172702
85South Africa3.364486
86Colombia21.693634
87Pakistan6.085563
88Vanuatu0.000000
89Moldova8.549223
90Vietnam5.473769
91Mali10.650330
\n", + "

92 rows × 2 columns

\n", + "
" + ], "text/plain": [ - "[Row(status=u'in_repayment')]" + " country bad_loan_pct\n", + "0 Chad 93.220339\n", + "1 Paraguay 2.893067\n", + "2 Yemen 24.980195\n", + "3 Senegal 7.014218\n", + "4 Philippines 2.515182\n", + "5 Turkey 0.233645\n", + "6 Malawi 13.125000\n", + "7 Iraq 35.476593\n", + "8 Cambodia 1.433968\n", + "9 Afghanistan 26.235741\n", + "10 Rwanda 6.648555\n", + "11 Jordan 16.066482\n", + "12 Kosovo 11.893370\n", + "13 Sri Lanka 25.296443\n", + "14 Togo 16.013720\n", + "15 Ecuador 7.329710\n", + "16 Lesotho 4.081633\n", + "17 Albania 9.304468\n", + "18 Madagascar 3.710247\n", + "19 Nicaragua 10.434992\n", + "20 Ghana 9.171567\n", + "21 Sierra Leone 21.054804\n", + "22 Peru 4.051516\n", + "23 Benin 8.137597\n", + "24 India 4.391382\n", + "25 United States 34.945144\n", + "26 China 0.000000\n", + "27 Timor-Leste 9.585492\n", + "28 Lao PDR 0.000000\n", + "29 Somalia 17.316017\n", + ".. ... ...\n", + "62 Kyrgyzstan 7.112293\n", + "63 Samoa 3.421662\n", + "64 Mozambique 10.196484\n", + "65 Brazil 42.477876\n", + "66 Belize 45.026178\n", + "67 Kenya 12.214316\n", + "68 Lebanon 6.509946\n", + "69 Dominican Republic 14.982415\n", + "70 Tanzania 11.312424\n", + "71 Botswana 100.000000\n", + "72 Bosnia and Herzegovina 5.536913\n", + "73 Haiti 25.527192\n", + "74 Cameroon 4.695009\n", + "75 Papua New Guinea 0.000000\n", + "76 Solomon Islands 0.000000\n", + "77 Bulgaria 2.333333\n", + "78 St Vincent 72.727273\n", + "79 Nepal 0.397953\n", + "80 El Salvador 17.143167\n", + "81 Egypt 0.000000\n", + "82 Costa Rica 8.917357\n", + "83 Congo (Rep.) 1.198257\n", + "84 Burkina Faso 2.172702\n", + "85 South Africa 3.364486\n", + "86 Colombia 21.693634\n", + "87 Pakistan 6.085563\n", + "88 Vanuatu 0.000000\n", + "89 Moldova 8.549223\n", + "90 Vietnam 5.473769\n", + "91 Mali 10.650330\n", + "\n", + "[92 rows x 2 columns]" ] }, - "execution_count": 19, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "loans.registerTempTable('loans')\n", - "sparkSql.sql('''\n", - "SELECT loans.status\n", - "FROM loans\n", - "LIMIT 1\n", - "''').collect()" + "bad_pct_df = sparkSql.sql('''\n", + "SELECT\n", + " sub.country,\n", + " sub.bad_loans / sub.total_loans * 100.0 as bad_loan_pct\n", + "FROM (SELECT\n", + " SUM(CASE WHEN\n", + " loans.status = 'refunded' OR\n", + " loans.status = 'expired' OR\n", + " loans.status = 'defaulted' OR\n", + " loans.delinquent = True\n", + " THEN 1\n", + " ELSE 0 END) AS bad_loans,\n", + " COUNT(*) AS total_loans,\n", + " loans.location.country\n", + " FROM loans\n", + " GROUP BY\n", + " loans.location.country\n", + ") sub\n", + "''').toPandas()\n", + "\n", + "bad_pct_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrybad_loan_pct
71Botswana100.000000
47Gaza100.000000
0Chad93.220339
78St Vincent72.727273
66Belize45.026178
\n", + "
" + ], + "text/plain": [ + " country bad_loan_pct\n", + "71 Botswana 100.000000\n", + "47 Gaza 100.000000\n", + "0 Chad 93.220339\n", + "78 St Vincent 72.727273\n", + "66 Belize 45.026178" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bad_pct_df.sort_values(by='bad_loan_pct', ascending=False).head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrybad_loan_pct
26China0
41Cape Verde0
39Thailand0
38Bangladesh0
37Mauritania0
\n", + "
" + ], + "text/plain": [ + " country bad_loan_pct\n", + "26 China 0\n", + "41 Cape Verde 0\n", + "39 Thailand 0\n", + "38 Bangladesh 0\n", + "37 Mauritania 0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bad_pct_df.sort_values(by='bad_loan_pct').head()" ] } ],