diff --git a/.ipynb_checkpoints/Kiva Exploration-checkpoint.ipynb b/.ipynb_checkpoints/Kiva Exploration-checkpoint.ipynb
new file mode 100644
index 0000000..d4ed9b4
--- /dev/null
+++ b/.ipynb_checkpoints/Kiva Exploration-checkpoint.ipynb
@@ -0,0 +1,810 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Understanding the Kiva Dataset\n",
+ "\n",
+ "Before we actually get into the work of predicting anything based on the data Kiva makes public, we first want to get a better picture of what the dataset actually looks like.\n",
+ "\n",
+ "Our first step: What is the schema of the data? Spark SQL will make it easy to query data in the future, but we need to know first what is available."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "sparkSql = (SparkSession.builder\n",
+ " .master(\"local\")\n",
+ " .appName(\"Kiva Exploration\")\n",
+ " .getOrCreate())\n",
+ "\n",
+ "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
+ "loans.registerTempTable('loans')\n",
+ "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
+ "lenders.registerTempTable('lenders')\n",
+ "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
+ "loans_lenders.registerTempTable('loans_lenders')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "root\n",
+ " |-- activity: string (nullable = true)\n",
+ " |-- basket_amount: long (nullable = true)\n",
+ " |-- bonus_credit_eligibility: boolean (nullable = true)\n",
+ " |-- borrowers: array (nullable = true)\n",
+ " | |-- element: struct (containsNull = true)\n",
+ " | | |-- first_name: string (nullable = true)\n",
+ " | | |-- gender: string (nullable = true)\n",
+ " | | |-- last_name: string (nullable = true)\n",
+ " | | |-- pictured: boolean (nullable = true)\n",
+ " |-- currency_exchange_loss_amount: double (nullable = true)\n",
+ " |-- delinquent: boolean (nullable = true)\n",
+ " |-- description: struct (nullable = true)\n",
+ " | |-- languages: array (nullable = true)\n",
+ " | | |-- element: string (containsNull = true)\n",
+ " | |-- texts: struct (nullable = true)\n",
+ " | | |-- ar: string (nullable = true)\n",
+ " | | |-- en: string (nullable = true)\n",
+ " | | |-- es: string (nullable = true)\n",
+ " | | |-- fr: string (nullable = true)\n",
+ " | | |-- id: string (nullable = true)\n",
+ " | | |-- mn: string (nullable = true)\n",
+ " | | |-- pt: string (nullable = true)\n",
+ " | | |-- ru: string (nullable = true)\n",
+ " | | |-- vi: string (nullable = true)\n",
+ " |-- funded_amount: long (nullable = true)\n",
+ " |-- funded_date: string (nullable = true)\n",
+ " |-- id: long (nullable = true)\n",
+ " |-- image: struct (nullable = true)\n",
+ " | |-- id: long (nullable = true)\n",
+ " | |-- template_id: long (nullable = true)\n",
+ " |-- journal_totals: struct (nullable = true)\n",
+ " | |-- bulkEntries: long (nullable = true)\n",
+ " | |-- entries: long (nullable = true)\n",
+ " |-- lender_count: long (nullable = true)\n",
+ " |-- loan_amount: long (nullable = true)\n",
+ " |-- location: struct (nullable = true)\n",
+ " | |-- country: string (nullable = true)\n",
+ " | |-- country_code: string (nullable = true)\n",
+ " | |-- geo: struct (nullable = true)\n",
+ " | | |-- level: string (nullable = true)\n",
+ " | | |-- pairs: string (nullable = true)\n",
+ " | | |-- type: string (nullable = true)\n",
+ " | |-- town: string (nullable = true)\n",
+ " |-- name: string (nullable = true)\n",
+ " |-- paid_amount: double (nullable = true)\n",
+ " |-- paid_date: string (nullable = true)\n",
+ " |-- partner_id: long (nullable = true)\n",
+ " |-- payments: array (nullable = true)\n",
+ " | |-- element: struct (containsNull = true)\n",
+ " | | |-- amount: double (nullable = true)\n",
+ " | | |-- currency_exchange_loss_amount: double (nullable = true)\n",
+ " | | |-- local_amount: double (nullable = true)\n",
+ " | | |-- payment_id: long (nullable = true)\n",
+ " | | |-- processed_date: string (nullable = true)\n",
+ " | | |-- rounded_local_amount: double (nullable = true)\n",
+ " | | |-- settlement_date: string (nullable = true)\n",
+ " |-- planned_expiration_date: string (nullable = true)\n",
+ " |-- posted_date: string (nullable = true)\n",
+ " |-- sector: string (nullable = true)\n",
+ " |-- status: string (nullable = true)\n",
+ " |-- tags: array (nullable = true)\n",
+ " | |-- element: struct (containsNull = true)\n",
+ " | | |-- name: string (nullable = true)\n",
+ " |-- terms: struct (nullable = true)\n",
+ " | |-- disbursal_amount: double (nullable = true)\n",
+ " | |-- disbursal_currency: string (nullable = true)\n",
+ " | |-- disbursal_date: string (nullable = true)\n",
+ " | |-- loan_amount: long (nullable = true)\n",
+ " | |-- local_payments: array (nullable = true)\n",
+ " | | |-- element: struct (containsNull = true)\n",
+ " | | | |-- amount: double (nullable = true)\n",
+ " | | | |-- due_date: string (nullable = true)\n",
+ " | |-- loss_liability: struct (nullable = true)\n",
+ " | | |-- currency_exchange: string (nullable = true)\n",
+ " | | |-- currency_exchange_coverage_rate: double (nullable = true)\n",
+ " | | |-- nonpayment: string (nullable = true)\n",
+ " | |-- repayment_interval: string (nullable = true)\n",
+ " | |-- repayment_term: long (nullable = true)\n",
+ " | |-- scheduled_payments: array (nullable = true)\n",
+ " | | |-- element: struct (containsNull = true)\n",
+ " | | | |-- amount: double (nullable = true)\n",
+ " | | | |-- due_date: string (nullable = true)\n",
+ " |-- themes: array (nullable = true)\n",
+ " | |-- element: string (containsNull = true)\n",
+ " |-- translator: struct (nullable = true)\n",
+ " | |-- byline: string (nullable = true)\n",
+ " | |-- image: long (nullable = true)\n",
+ " |-- use: string (nullable = true)\n",
+ " |-- video: struct (nullable = true)\n",
+ " | |-- id: long (nullable = true)\n",
+ " | |-- thumbnailImageId: long (nullable = true)\n",
+ " | |-- title: string (nullable = true)\n",
+ " | |-- youtubeId: string (nullable = true)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "loans.printSchema()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Row(status=u'refunded', count=5504),\n",
+ " Row(status=u'defaulted', count=21776),\n",
+ " Row(status=u'in_repayment', count=155749),\n",
+ " Row(status=u'reviewed', count=3),\n",
+ " Row(status=u'deleted', count=2721),\n",
+ " Row(status=u'paid', count=775330),\n",
+ " Row(status=u'issue', count=199),\n",
+ " Row(status=u'inactive_expired', count=12421),\n",
+ " Row(status=u'fundraising', count=3986),\n",
+ " Row(status=u'expired', count=33773),\n",
+ " Row(status=u'inactive', count=2493),\n",
+ " Row(status=u'funded', count=173),\n",
+ " Row(status=u'', count=2)]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "loans.groupby(loans.status).count().collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Row(delinquent=None, count=970465), Row(delinquent=True, count=43665)]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "loans.groupby(loans.delinquent).count().collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Row(status=u'refunded', count=156),\n",
+ " Row(status=u'defaulted', count=20116),\n",
+ " Row(status=u'in_repayment', count=23393)]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "loans.where(loans.delinquent == True).groupby(loans.status).count().collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " bad_loan_pct | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Chad | \n",
+ " 93.220339 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Paraguay | \n",
+ " 2.893067 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Yemen | \n",
+ " 24.980195 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Senegal | \n",
+ " 7.014218 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Philippines | \n",
+ " 2.515182 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Turkey | \n",
+ " 0.233645 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Malawi | \n",
+ " 13.125000 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Iraq | \n",
+ " 35.476593 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Cambodia | \n",
+ " 1.433968 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Afghanistan | \n",
+ " 26.235741 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Rwanda | \n",
+ " 6.648555 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Jordan | \n",
+ " 16.066482 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Kosovo | \n",
+ " 11.893370 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Sri Lanka | \n",
+ " 25.296443 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Togo | \n",
+ " 16.013720 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Ecuador | \n",
+ " 7.329710 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " Lesotho | \n",
+ " 4.081633 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Albania | \n",
+ " 9.304468 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Madagascar | \n",
+ " 3.710247 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Nicaragua | \n",
+ " 10.434992 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Ghana | \n",
+ " 9.171567 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " Sierra Leone | \n",
+ " 21.054804 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " Peru | \n",
+ " 4.051516 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Benin | \n",
+ " 8.137597 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " India | \n",
+ " 4.391382 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " United States | \n",
+ " 34.945144 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " China | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Timor-Leste | \n",
+ " 9.585492 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " Lao PDR | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Somalia | \n",
+ " 17.316017 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 62 | \n",
+ " Kyrgyzstan | \n",
+ " 7.112293 | \n",
+ "
\n",
+ " \n",
+ " 63 | \n",
+ " Samoa | \n",
+ " 3.421662 | \n",
+ "
\n",
+ " \n",
+ " 64 | \n",
+ " Mozambique | \n",
+ " 10.196484 | \n",
+ "
\n",
+ " \n",
+ " 65 | \n",
+ " Brazil | \n",
+ " 42.477876 | \n",
+ "
\n",
+ " \n",
+ " 66 | \n",
+ " Belize | \n",
+ " 45.026178 | \n",
+ "
\n",
+ " \n",
+ " 67 | \n",
+ " Kenya | \n",
+ " 12.214316 | \n",
+ "
\n",
+ " \n",
+ " 68 | \n",
+ " Lebanon | \n",
+ " 6.509946 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " Dominican Republic | \n",
+ " 14.982415 | \n",
+ "
\n",
+ " \n",
+ " 70 | \n",
+ " Tanzania | \n",
+ " 11.312424 | \n",
+ "
\n",
+ " \n",
+ " 71 | \n",
+ " Botswana | \n",
+ " 100.000000 | \n",
+ "
\n",
+ " \n",
+ " 72 | \n",
+ " Bosnia and Herzegovina | \n",
+ " 5.536913 | \n",
+ "
\n",
+ " \n",
+ " 73 | \n",
+ " Haiti | \n",
+ " 25.527192 | \n",
+ "
\n",
+ " \n",
+ " 74 | \n",
+ " Cameroon | \n",
+ " 4.695009 | \n",
+ "
\n",
+ " \n",
+ " 75 | \n",
+ " Papua New Guinea | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 76 | \n",
+ " Solomon Islands | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 77 | \n",
+ " Bulgaria | \n",
+ " 2.333333 | \n",
+ "
\n",
+ " \n",
+ " 78 | \n",
+ " St Vincent | \n",
+ " 72.727273 | \n",
+ "
\n",
+ " \n",
+ " 79 | \n",
+ " Nepal | \n",
+ " 0.397953 | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " El Salvador | \n",
+ " 17.143167 | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " Egypt | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " Costa Rica | \n",
+ " 8.917357 | \n",
+ "
\n",
+ " \n",
+ " 83 | \n",
+ " Congo (Rep.) | \n",
+ " 1.198257 | \n",
+ "
\n",
+ " \n",
+ " 84 | \n",
+ " Burkina Faso | \n",
+ " 2.172702 | \n",
+ "
\n",
+ " \n",
+ " 85 | \n",
+ " South Africa | \n",
+ " 3.364486 | \n",
+ "
\n",
+ " \n",
+ " 86 | \n",
+ " Colombia | \n",
+ " 21.693634 | \n",
+ "
\n",
+ " \n",
+ " 87 | \n",
+ " Pakistan | \n",
+ " 6.085563 | \n",
+ "
\n",
+ " \n",
+ " 88 | \n",
+ " Vanuatu | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 89 | \n",
+ " Moldova | \n",
+ " 8.549223 | \n",
+ "
\n",
+ " \n",
+ " 90 | \n",
+ " Vietnam | \n",
+ " 5.473769 | \n",
+ "
\n",
+ " \n",
+ " 91 | \n",
+ " Mali | \n",
+ " 10.650330 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
92 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country bad_loan_pct\n",
+ "0 Chad 93.220339\n",
+ "1 Paraguay 2.893067\n",
+ "2 Yemen 24.980195\n",
+ "3 Senegal 7.014218\n",
+ "4 Philippines 2.515182\n",
+ "5 Turkey 0.233645\n",
+ "6 Malawi 13.125000\n",
+ "7 Iraq 35.476593\n",
+ "8 Cambodia 1.433968\n",
+ "9 Afghanistan 26.235741\n",
+ "10 Rwanda 6.648555\n",
+ "11 Jordan 16.066482\n",
+ "12 Kosovo 11.893370\n",
+ "13 Sri Lanka 25.296443\n",
+ "14 Togo 16.013720\n",
+ "15 Ecuador 7.329710\n",
+ "16 Lesotho 4.081633\n",
+ "17 Albania 9.304468\n",
+ "18 Madagascar 3.710247\n",
+ "19 Nicaragua 10.434992\n",
+ "20 Ghana 9.171567\n",
+ "21 Sierra Leone 21.054804\n",
+ "22 Peru 4.051516\n",
+ "23 Benin 8.137597\n",
+ "24 India 4.391382\n",
+ "25 United States 34.945144\n",
+ "26 China 0.000000\n",
+ "27 Timor-Leste 9.585492\n",
+ "28 Lao PDR 0.000000\n",
+ "29 Somalia 17.316017\n",
+ ".. ... ...\n",
+ "62 Kyrgyzstan 7.112293\n",
+ "63 Samoa 3.421662\n",
+ "64 Mozambique 10.196484\n",
+ "65 Brazil 42.477876\n",
+ "66 Belize 45.026178\n",
+ "67 Kenya 12.214316\n",
+ "68 Lebanon 6.509946\n",
+ "69 Dominican Republic 14.982415\n",
+ "70 Tanzania 11.312424\n",
+ "71 Botswana 100.000000\n",
+ "72 Bosnia and Herzegovina 5.536913\n",
+ "73 Haiti 25.527192\n",
+ "74 Cameroon 4.695009\n",
+ "75 Papua New Guinea 0.000000\n",
+ "76 Solomon Islands 0.000000\n",
+ "77 Bulgaria 2.333333\n",
+ "78 St Vincent 72.727273\n",
+ "79 Nepal 0.397953\n",
+ "80 El Salvador 17.143167\n",
+ "81 Egypt 0.000000\n",
+ "82 Costa Rica 8.917357\n",
+ "83 Congo (Rep.) 1.198257\n",
+ "84 Burkina Faso 2.172702\n",
+ "85 South Africa 3.364486\n",
+ "86 Colombia 21.693634\n",
+ "87 Pakistan 6.085563\n",
+ "88 Vanuatu 0.000000\n",
+ "89 Moldova 8.549223\n",
+ "90 Vietnam 5.473769\n",
+ "91 Mali 10.650330\n",
+ "\n",
+ "[92 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bad_pct_df = sparkSql.sql('''\n",
+ "SELECT\n",
+ " sub.country,\n",
+ " sub.bad_loans / sub.total_loans * 100.0 as bad_loan_pct\n",
+ "FROM (SELECT\n",
+ " SUM(CASE WHEN\n",
+ " loans.status = 'refunded' OR\n",
+ " loans.status = 'expired' OR\n",
+ " loans.status = 'defaulted' OR\n",
+ " loans.delinquent = True\n",
+ " THEN 1\n",
+ " ELSE 0 END) AS bad_loans,\n",
+ " COUNT(*) AS total_loans,\n",
+ " loans.location.country\n",
+ " FROM loans\n",
+ " GROUP BY\n",
+ " loans.location.country\n",
+ ") sub\n",
+ "''').toPandas()\n",
+ "\n",
+ "bad_pct_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " bad_loan_pct | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 71 | \n",
+ " Botswana | \n",
+ " 100.000000 | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " Gaza | \n",
+ " 100.000000 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Chad | \n",
+ " 93.220339 | \n",
+ "
\n",
+ " \n",
+ " 78 | \n",
+ " St Vincent | \n",
+ " 72.727273 | \n",
+ "
\n",
+ " \n",
+ " 66 | \n",
+ " Belize | \n",
+ " 45.026178 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country bad_loan_pct\n",
+ "71 Botswana 100.000000\n",
+ "47 Gaza 100.000000\n",
+ "0 Chad 93.220339\n",
+ "78 St Vincent 72.727273\n",
+ "66 Belize 45.026178"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bad_pct_df.sort_values(by='bad_loan_pct', ascending=False).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " bad_loan_pct | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 26 | \n",
+ " China | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " Cape Verde | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " Thailand | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " Bangladesh | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " Mauritania | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country bad_loan_pct\n",
+ "26 China 0\n",
+ "41 Cape Verde 0\n",
+ "39 Thailand 0\n",
+ "38 Bangladesh 0\n",
+ "37 Mauritania 0"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bad_pct_df.sort_values(by='bad_loan_pct').head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/Kiva Exploration.ipynb b/Kiva Exploration.ipynb
index 7013b7f..d4ed9b4 100644
--- a/Kiva Exploration.ipynb
+++ b/Kiva Exploration.ipynb
@@ -25,8 +25,11 @@
" .getOrCreate())\n",
"\n",
"loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
+ "loans.registerTempTable('loans')\n",
"lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
- "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')"
+ "lenders.registerTempTable('lenders')\n",
+ "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
+ "loans_lenders.registerTempTable('loans_lenders')"
]
},
{
@@ -224,29 +227,562 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " bad_loan_pct | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Chad | \n",
+ " 93.220339 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Paraguay | \n",
+ " 2.893067 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Yemen | \n",
+ " 24.980195 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Senegal | \n",
+ " 7.014218 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Philippines | \n",
+ " 2.515182 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Turkey | \n",
+ " 0.233645 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Malawi | \n",
+ " 13.125000 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Iraq | \n",
+ " 35.476593 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Cambodia | \n",
+ " 1.433968 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Afghanistan | \n",
+ " 26.235741 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Rwanda | \n",
+ " 6.648555 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " Jordan | \n",
+ " 16.066482 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Kosovo | \n",
+ " 11.893370 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Sri Lanka | \n",
+ " 25.296443 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " Togo | \n",
+ " 16.013720 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " Ecuador | \n",
+ " 7.329710 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " Lesotho | \n",
+ " 4.081633 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " Albania | \n",
+ " 9.304468 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " Madagascar | \n",
+ " 3.710247 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Nicaragua | \n",
+ " 10.434992 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " Ghana | \n",
+ " 9.171567 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " Sierra Leone | \n",
+ " 21.054804 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " Peru | \n",
+ " 4.051516 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Benin | \n",
+ " 8.137597 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " India | \n",
+ " 4.391382 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " United States | \n",
+ " 34.945144 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " China | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " Timor-Leste | \n",
+ " 9.585492 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " Lao PDR | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " Somalia | \n",
+ " 17.316017 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 62 | \n",
+ " Kyrgyzstan | \n",
+ " 7.112293 | \n",
+ "
\n",
+ " \n",
+ " 63 | \n",
+ " Samoa | \n",
+ " 3.421662 | \n",
+ "
\n",
+ " \n",
+ " 64 | \n",
+ " Mozambique | \n",
+ " 10.196484 | \n",
+ "
\n",
+ " \n",
+ " 65 | \n",
+ " Brazil | \n",
+ " 42.477876 | \n",
+ "
\n",
+ " \n",
+ " 66 | \n",
+ " Belize | \n",
+ " 45.026178 | \n",
+ "
\n",
+ " \n",
+ " 67 | \n",
+ " Kenya | \n",
+ " 12.214316 | \n",
+ "
\n",
+ " \n",
+ " 68 | \n",
+ " Lebanon | \n",
+ " 6.509946 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " Dominican Republic | \n",
+ " 14.982415 | \n",
+ "
\n",
+ " \n",
+ " 70 | \n",
+ " Tanzania | \n",
+ " 11.312424 | \n",
+ "
\n",
+ " \n",
+ " 71 | \n",
+ " Botswana | \n",
+ " 100.000000 | \n",
+ "
\n",
+ " \n",
+ " 72 | \n",
+ " Bosnia and Herzegovina | \n",
+ " 5.536913 | \n",
+ "
\n",
+ " \n",
+ " 73 | \n",
+ " Haiti | \n",
+ " 25.527192 | \n",
+ "
\n",
+ " \n",
+ " 74 | \n",
+ " Cameroon | \n",
+ " 4.695009 | \n",
+ "
\n",
+ " \n",
+ " 75 | \n",
+ " Papua New Guinea | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 76 | \n",
+ " Solomon Islands | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 77 | \n",
+ " Bulgaria | \n",
+ " 2.333333 | \n",
+ "
\n",
+ " \n",
+ " 78 | \n",
+ " St Vincent | \n",
+ " 72.727273 | \n",
+ "
\n",
+ " \n",
+ " 79 | \n",
+ " Nepal | \n",
+ " 0.397953 | \n",
+ "
\n",
+ " \n",
+ " 80 | \n",
+ " El Salvador | \n",
+ " 17.143167 | \n",
+ "
\n",
+ " \n",
+ " 81 | \n",
+ " Egypt | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 82 | \n",
+ " Costa Rica | \n",
+ " 8.917357 | \n",
+ "
\n",
+ " \n",
+ " 83 | \n",
+ " Congo (Rep.) | \n",
+ " 1.198257 | \n",
+ "
\n",
+ " \n",
+ " 84 | \n",
+ " Burkina Faso | \n",
+ " 2.172702 | \n",
+ "
\n",
+ " \n",
+ " 85 | \n",
+ " South Africa | \n",
+ " 3.364486 | \n",
+ "
\n",
+ " \n",
+ " 86 | \n",
+ " Colombia | \n",
+ " 21.693634 | \n",
+ "
\n",
+ " \n",
+ " 87 | \n",
+ " Pakistan | \n",
+ " 6.085563 | \n",
+ "
\n",
+ " \n",
+ " 88 | \n",
+ " Vanuatu | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 89 | \n",
+ " Moldova | \n",
+ " 8.549223 | \n",
+ "
\n",
+ " \n",
+ " 90 | \n",
+ " Vietnam | \n",
+ " 5.473769 | \n",
+ "
\n",
+ " \n",
+ " 91 | \n",
+ " Mali | \n",
+ " 10.650330 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
92 rows × 2 columns
\n",
+ "
"
+ ],
"text/plain": [
- "[Row(status=u'in_repayment')]"
+ " country bad_loan_pct\n",
+ "0 Chad 93.220339\n",
+ "1 Paraguay 2.893067\n",
+ "2 Yemen 24.980195\n",
+ "3 Senegal 7.014218\n",
+ "4 Philippines 2.515182\n",
+ "5 Turkey 0.233645\n",
+ "6 Malawi 13.125000\n",
+ "7 Iraq 35.476593\n",
+ "8 Cambodia 1.433968\n",
+ "9 Afghanistan 26.235741\n",
+ "10 Rwanda 6.648555\n",
+ "11 Jordan 16.066482\n",
+ "12 Kosovo 11.893370\n",
+ "13 Sri Lanka 25.296443\n",
+ "14 Togo 16.013720\n",
+ "15 Ecuador 7.329710\n",
+ "16 Lesotho 4.081633\n",
+ "17 Albania 9.304468\n",
+ "18 Madagascar 3.710247\n",
+ "19 Nicaragua 10.434992\n",
+ "20 Ghana 9.171567\n",
+ "21 Sierra Leone 21.054804\n",
+ "22 Peru 4.051516\n",
+ "23 Benin 8.137597\n",
+ "24 India 4.391382\n",
+ "25 United States 34.945144\n",
+ "26 China 0.000000\n",
+ "27 Timor-Leste 9.585492\n",
+ "28 Lao PDR 0.000000\n",
+ "29 Somalia 17.316017\n",
+ ".. ... ...\n",
+ "62 Kyrgyzstan 7.112293\n",
+ "63 Samoa 3.421662\n",
+ "64 Mozambique 10.196484\n",
+ "65 Brazil 42.477876\n",
+ "66 Belize 45.026178\n",
+ "67 Kenya 12.214316\n",
+ "68 Lebanon 6.509946\n",
+ "69 Dominican Republic 14.982415\n",
+ "70 Tanzania 11.312424\n",
+ "71 Botswana 100.000000\n",
+ "72 Bosnia and Herzegovina 5.536913\n",
+ "73 Haiti 25.527192\n",
+ "74 Cameroon 4.695009\n",
+ "75 Papua New Guinea 0.000000\n",
+ "76 Solomon Islands 0.000000\n",
+ "77 Bulgaria 2.333333\n",
+ "78 St Vincent 72.727273\n",
+ "79 Nepal 0.397953\n",
+ "80 El Salvador 17.143167\n",
+ "81 Egypt 0.000000\n",
+ "82 Costa Rica 8.917357\n",
+ "83 Congo (Rep.) 1.198257\n",
+ "84 Burkina Faso 2.172702\n",
+ "85 South Africa 3.364486\n",
+ "86 Colombia 21.693634\n",
+ "87 Pakistan 6.085563\n",
+ "88 Vanuatu 0.000000\n",
+ "89 Moldova 8.549223\n",
+ "90 Vietnam 5.473769\n",
+ "91 Mali 10.650330\n",
+ "\n",
+ "[92 rows x 2 columns]"
]
},
- "execution_count": 19,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "loans.registerTempTable('loans')\n",
- "sparkSql.sql('''\n",
- "SELECT loans.status\n",
- "FROM loans\n",
- "LIMIT 1\n",
- "''').collect()"
+ "bad_pct_df = sparkSql.sql('''\n",
+ "SELECT\n",
+ " sub.country,\n",
+ " sub.bad_loans / sub.total_loans * 100.0 as bad_loan_pct\n",
+ "FROM (SELECT\n",
+ " SUM(CASE WHEN\n",
+ " loans.status = 'refunded' OR\n",
+ " loans.status = 'expired' OR\n",
+ " loans.status = 'defaulted' OR\n",
+ " loans.delinquent = True\n",
+ " THEN 1\n",
+ " ELSE 0 END) AS bad_loans,\n",
+ " COUNT(*) AS total_loans,\n",
+ " loans.location.country\n",
+ " FROM loans\n",
+ " GROUP BY\n",
+ " loans.location.country\n",
+ ") sub\n",
+ "''').toPandas()\n",
+ "\n",
+ "bad_pct_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " bad_loan_pct | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 71 | \n",
+ " Botswana | \n",
+ " 100.000000 | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " Gaza | \n",
+ " 100.000000 | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " Chad | \n",
+ " 93.220339 | \n",
+ "
\n",
+ " \n",
+ " 78 | \n",
+ " St Vincent | \n",
+ " 72.727273 | \n",
+ "
\n",
+ " \n",
+ " 66 | \n",
+ " Belize | \n",
+ " 45.026178 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country bad_loan_pct\n",
+ "71 Botswana 100.000000\n",
+ "47 Gaza 100.000000\n",
+ "0 Chad 93.220339\n",
+ "78 St Vincent 72.727273\n",
+ "66 Belize 45.026178"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bad_pct_df.sort_values(by='bad_loan_pct', ascending=False).head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " bad_loan_pct | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 26 | \n",
+ " China | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " Cape Verde | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 39 | \n",
+ " Thailand | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 38 | \n",
+ " Bangladesh | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 37 | \n",
+ " Mauritania | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country bad_loan_pct\n",
+ "26 China 0\n",
+ "41 Cape Verde 0\n",
+ "39 Thailand 0\n",
+ "38 Bangladesh 0\n",
+ "37 Mauritania 0"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bad_pct_df.sort_values(by='bad_loan_pct').head()"
]
}
],