From dbb8843c0262a2ce3ea7d45f8ca3f1a79b9577c4 Mon Sep 17 00:00:00 2001 From: karlloic Date: Wed, 30 Nov 2016 12:26:38 -0500 Subject: [PATCH] Attempt to predict using svm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Differences with “Default Prediction” - Added “extract_tags” UDF - Just did train test split for now - Prediction using svm --- Basic Prediction.ipynb | 3726 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3726 insertions(+) create mode 100644 Basic Prediction.ipynb diff --git a/Basic Prediction.ipynb b/Basic Prediction.ipynb new file mode 100644 index 0000000..c47488b --- /dev/null +++ b/Basic Prediction.ipynb @@ -0,0 +1,3726 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.sql import SQLContext\n", + "\n", + "LOCAL_PATH = 'file:///Users/Karl-Loic/Documents/Columbia-University/Fall-2016/Big-Data-Analytics/final-project/kiva-dig/'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sql_ctx = SQLContext(sc)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# TODO: Create permanent database tables for reuse\n", + "# Remove previously created metastore DB (plain relative path: LOCAL_PATH is a file:// URI, which os.path cannot test)\n", + "import os, shutil\n", + "\n", + "if os.path.exists('metastore_db/'):\n", + "    shutil.rmtree('metastore_db/')\n", + "loans = sql_ctx.read.json(LOCAL_PATH + 'kiva-data/loans.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1014130" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false
+ }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- activity: string (nullable = true)\n", + " |-- basket_amount: long (nullable = true)\n", + " |-- bonus_credit_eligibility: boolean (nullable = true)\n", + " |-- borrowers: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- first_name: string (nullable = true)\n", + " | | |-- gender: string (nullable = true)\n", + " | | |-- last_name: string (nullable = true)\n", + " | | |-- pictured: boolean (nullable = true)\n", + " |-- currency_exchange_loss_amount: double (nullable = true)\n", + " |-- delinquent: boolean (nullable = true)\n", + " |-- description: struct (nullable = true)\n", + " | |-- languages: array (nullable = true)\n", + " | | |-- element: string (containsNull = true)\n", + " | |-- texts: struct (nullable = true)\n", + " | | |-- ar: string (nullable = true)\n", + " | | |-- en: string (nullable = true)\n", + " | | |-- es: string (nullable = true)\n", + " | | |-- fr: string (nullable = true)\n", + " | | |-- id: string (nullable = true)\n", + " | | |-- mn: string (nullable = true)\n", + " | | |-- pt: string (nullable = true)\n", + " | | |-- ru: string (nullable = true)\n", + " | | |-- vi: string (nullable = true)\n", + " |-- funded_amount: long (nullable = true)\n", + " |-- funded_date: string (nullable = true)\n", + " |-- id: long (nullable = true)\n", + " |-- image: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- template_id: long (nullable = true)\n", + " |-- journal_totals: struct (nullable = true)\n", + " | |-- bulkEntries: long (nullable = true)\n", + " | |-- entries: long (nullable = true)\n", + " |-- lender_count: long (nullable = true)\n", + " |-- loan_amount: long (nullable = true)\n", + " |-- location: struct (nullable = true)\n", + " | |-- country: string (nullable = true)\n", + " | |-- country_code: string (nullable = true)\n", + " | |-- geo: struct (nullable = true)\n", + " 
| | |-- level: string (nullable = true)\n", + " | | |-- pairs: string (nullable = true)\n", + " | | |-- type: string (nullable = true)\n", + " | |-- town: string (nullable = true)\n", + " |-- name: string (nullable = true)\n", + " |-- paid_amount: double (nullable = true)\n", + " |-- paid_date: string (nullable = true)\n", + " |-- partner_id: long (nullable = true)\n", + " |-- payments: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- amount: double (nullable = true)\n", + " | | |-- currency_exchange_loss_amount: double (nullable = true)\n", + " | | |-- local_amount: double (nullable = true)\n", + " | | |-- payment_id: long (nullable = true)\n", + " | | |-- processed_date: string (nullable = true)\n", + " | | |-- rounded_local_amount: double (nullable = true)\n", + " | | |-- settlement_date: string (nullable = true)\n", + " |-- planned_expiration_date: string (nullable = true)\n", + " |-- posted_date: string (nullable = true)\n", + " |-- sector: string (nullable = true)\n", + " |-- status: string (nullable = true)\n", + " |-- tags: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- name: string (nullable = true)\n", + " |-- terms: struct (nullable = true)\n", + " | |-- disbursal_amount: double (nullable = true)\n", + " | |-- disbursal_currency: string (nullable = true)\n", + " | |-- disbursal_date: string (nullable = true)\n", + " | |-- loan_amount: long (nullable = true)\n", + " | |-- local_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " | |-- loss_liability: struct (nullable = true)\n", + " | | |-- currency_exchange: string (nullable = true)\n", + " | | |-- currency_exchange_coverage_rate: double (nullable = true)\n", + " | | |-- nonpayment: string (nullable = true)\n", + " | |-- repayment_interval: string (nullable = true)\n", + " 
| |-- repayment_term: long (nullable = true)\n", + " | |-- scheduled_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " |-- themes: array (nullable = true)\n", + " | |-- element: string (containsNull = true)\n", + " |-- translator: struct (nullable = true)\n", + " | |-- byline: string (nullable = true)\n", + " | |-- image: long (nullable = true)\n", + " |-- use: string (nullable = true)\n", + " |-- video: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- thumbnailImageId: long (nullable = true)\n", + " | |-- title: string (nullable = true)\n", + " | |-- youtubeId: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "loans.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(activity=u'Clothing Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=None, use=u'Compra de ropa dama, caballero y ni\\xf1o.', tags=[], status=u'deleted', delinquent=None),\n", + " Row(activity=u'Food Production/Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-03-15T04:52:22Z', use=u'Working capital', tags=[], status=u'paid', delinquent=None),\n", + " Row(activity=u'Food Production/Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-05-17T22:02:42Z', use=u'Sell bread at the local market', tags=[], status=u'paid', delinquent=None),\n", + " Row(activity=u'Clothing Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-07-20T16:35:52Z', use=u'To buy more clothes and baby things for sale', tags=[], status=u'paid', delinquent=None),\n", + " Row(activity=u'Livestock', basket_amount=None, 
num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-09-20T02:55:18Z', use=u'Lohoure Odio will use the loan to purchase pigs for meat sales', tags=[], status=u'paid', delinquent=None),\n", + " Row(activity=u'Construction', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-11-02T23:38:36Z', use=u'buy a welding machine', tags=[], status=u'paid', delinquent=None),\n", + " Row(activity=u'Cereals', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-12-13T18:22:55Z', use=u'To buy rice for resale', tags=[], status=u'paid', delinquent=None),\n", + " Row(activity=u'Taxi', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'1.0', disbursal_date=u'2008-01-11T20:44:52Z', use=u'Taxi repairs and repainting', tags=[], status=u'paid', delinquent=None),\n", + " Row(activity=u'Used Clothing', basket_amount=None, num_borrowers=4, male_proportion(borrowers)=u'0.0', disbursal_date=u'2008-04-19T14:11:32Z', use=u'to invest in their businesses', tags=[], status=u'paid', delinquent=None),\n", + " Row(activity=u'Home Products Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2008-03-10T22:53:27Z', use=u'Edith needs a loan of 1000 soles (3 soles = $1), which she will use to buy more kitchen appliances.', tags=[], status=u'paid', delinquent=None)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.registerTempTable(\"loans\")\n", + "\n", + "def male_proportion(borrowers):\n", + "    # Fraction of borrowers whose gender is 'M'; None (SQL NULL) when the array is missing or empty\n", + "    if not borrowers:\n", + "        return None\n", + "    num_males = sum(1 for item in borrowers if item.gender == 'M')\n", + " \n", + "    return num_males / float(len(borrowers))\n", + "\n", + "sql_ctx.registerFunction('male_proportion', male_proportion)\n", + "\n", + "bad_loans = sql_ctx.sql(\"\"\"\n", + "SELECT \n", + " activity, \n", + " basket_amount,\n", + " size(borrowers) as num_borrowers,\n", + " 
male_proportion(borrowers),\n", + " terms.disbursal_date,\n", + " \n", + " use,\n", + " tags,\n", + " status,\n", + " delinquent\n", + "FROM loans\n", + "LIMIT 10\n", + "\"\"\")\n", + "\n", + "bad_loans.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(status=u'refunded', count(1)=5504),\n", + " Row(status=u'defaulted', count(1)=21776),\n", + " Row(status=u'in_repayment', count(1)=155748),\n", + " Row(status=u'reviewed', count(1)=3),\n", + " Row(status=u'deleted', count(1)=2722),\n", + " Row(status=u'paid', count(1)=775330),\n", + " Row(status=u'issue', count(1)=199),\n", + " Row(status=u'inactive_expired', count(1)=12421),\n", + " Row(status=u'fundraising', count(1)=3986),\n", + " Row(status=u'expired', count(1)=33773)]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql_ctx.sql(\"\"\"\n", + "SELECT\n", + " status,\n", + " COUNT(*)\n", + "FROM loans\n", + "GROUP BY status\n", + "LIMIT 10\n", + "\"\"\").collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Starts Here - GDP" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Datasets sources\n", + "1. Country codes - https://github.com/datasets/country-codes/tree/master/data\n", + "2. 
GDP Data - http://data.worldbank.org/indicator/NY.GDP.MKTP.CD" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from datetime import datetime\n", + "import numpy as np\n", + "import math\n", + "\n", + "\n", + "# Load country info data\n", + "country_codes_raw = pd.read_csv(LOCAL_PATH + 'economic-data/country-codes.csv')\n", + "country_gdp_raw = pd.read_csv(LOCAL_PATH + 'economic-data/country-gdp.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameofficial_name_enofficial_name_frISO3166-1-Alpha-2ISO3166-1-Alpha-3ISO3166-1-numericITUMARCWMODS...ISO4217-currency_minor_unitISO4217-currency_nameISO4217-currency_numeric_codeis_independentCapitalContinentTLDLanguagesgeonameidEDGAR
0NaNChannel IslandsÎles Anglo-NormandesNaNNaN830NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NaNSarkSercqNaNNaN680NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2AfghanistanAfghanistanAfghanistanAFAFG4AFGafAFAFG...2.0Afghani971.0YesKabulAS.affa-AF,ps,uz-AF,tk1149361.0B2
3AlbaniaAlbaniaAlbanieALALB8ALBaaABAL...2.0Lek8.0YesTiranaEU.alsq,el783754.0B3
4AlgeriaAlgeriaAlgérieDZDZA12ALGaeALDZ...2.0Algerian Dinar12.0YesAlgiersAF.dzar-DZ2589581.0B4
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " name official_name_en official_name_fr ISO3166-1-Alpha-2 \\\n", + "0 NaN Channel Islands Îles Anglo-Normandes NaN \n", + "1 NaN Sark Sercq NaN \n", + "2 Afghanistan Afghanistan Afghanistan AF \n", + "3 Albania Albania Albanie AL \n", + "4 Algeria Algeria Algérie DZ \n", + "\n", + " ISO3166-1-Alpha-3 ISO3166-1-numeric ITU MARC WMO DS ... \\\n", + "0 NaN 830 NaN NaN NaN NaN ... \n", + "1 NaN 680 NaN NaN NaN NaN ... \n", + "2 AFG 4 AFG af AF AFG ... \n", + "3 ALB 8 ALB aa AB AL ... \n", + "4 DZA 12 ALG ae AL DZ ... \n", + "\n", + " ISO4217-currency_minor_unit ISO4217-currency_name \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 2.0 Afghani \n", + "3 2.0 Lek \n", + "4 2.0 Algerian Dinar \n", + "\n", + " ISO4217-currency_numeric_code is_independent Capital Continent TLD \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "2 971.0 Yes Kabul AS .af \n", + "3 8.0 Yes Tirana EU .al \n", + "4 12.0 Yes Algiers AF .dz \n", + "\n", + " Languages geonameid EDGAR \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 fa-AF,ps,uz-AF,tk 1149361.0 B2 \n", + "3 sq,el 783754.0 B3 \n", + "4 ar-DZ 2589581.0 B4 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "country_codes_raw.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Series NameSeries CodeCountry NameCountry Code2002 [YR2002]2003 [YR2003]2004 [YR2004]2005 [YR2005]2006 [YR2006]2007 [YR2007]2008 [YR2008]2009 [YR2009]2010 [YR2010]2011 [YR2011]2012 [YR2012]2013 [YR2013]2014 [YR2014]2015 [YR2015]2016 [YR2016]
0GDP per capita (current US$)NY.GDP.PCAP.CDAfghanistanAFG192.153528278789203.651040923182224.914712193371257.175794656273280.245644106914380.400955186598384.131681276838458.955781585831569.940728793286622.379654358451690.842629014956653.347488111011633.947864294639590.269515382605..
1GDP per capita (current US$)NY.GDP.PCAP.CDAlbaniaALB1453.642776608531890.681557435442416.58823507012709.142930562013005.01290337563603.013685366384370.539646531484114.136544909454094.358831919184437.81199902584247.839852019074412.345578134214588.649440148113965.01680558488..
2GDP per capita (current US$)NY.GDP.PCAP.CDAlgeriaDZA1774.292020799112094.893302132982600.006519725593102.037384226733467.544740085963939.559939398084912.251940819953875.822095424264473.486445681155447.403975565695583.616159501315491.61441356485484.06680561484206.03123244958..
3GDP per capita (current US$)NY.GDP.PCAP.CDAmerican SamoaASM..............................
4GDP per capita (current US$)NY.GDP.PCAP.CDAndorraADO24175.372754252231742.992584753737235.450032314639990.330408567942417.229145698447253.529796311146735.999574501642701.44713625539639.38602121141630.052579297739666.369214744842806.5224483021......
\n", + "
" + ], + "text/plain": [ + " Series Name Series Code Country Name Country Code \\\n", + "0 GDP per capita (current US$) NY.GDP.PCAP.CD Afghanistan AFG \n", + "1 GDP per capita (current US$) NY.GDP.PCAP.CD Albania ALB \n", + "2 GDP per capita (current US$) NY.GDP.PCAP.CD Algeria DZA \n", + "3 GDP per capita (current US$) NY.GDP.PCAP.CD American Samoa ASM \n", + "4 GDP per capita (current US$) NY.GDP.PCAP.CD Andorra ADO \n", + "\n", + " 2002 [YR2002] 2003 [YR2003] 2004 [YR2004] 2005 [YR2005] \\\n", + "0 192.153528278789 203.651040923182 224.914712193371 257.175794656273 \n", + "1 1453.64277660853 1890.68155743544 2416.5882350701 2709.14293056201 \n", + "2 1774.29202079911 2094.89330213298 2600.00651972559 3102.03738422673 \n", + "3 .. .. .. .. \n", + "4 24175.3727542522 31742.9925847537 37235.4500323146 39990.3304085679 \n", + "\n", + " 2006 [YR2006] 2007 [YR2007] 2008 [YR2008] 2009 [YR2009] \\\n", + "0 280.245644106914 380.400955186598 384.131681276838 458.955781585831 \n", + "1 3005.0129033756 3603.01368536638 4370.53964653148 4114.13654490945 \n", + "2 3467.54474008596 3939.55993939808 4912.25194081995 3875.82209542426 \n", + "3 .. .. .. .. \n", + "4 42417.2291456984 47253.5297963111 46735.9995745016 42701.447136255 \n", + "\n", + " 2010 [YR2010] 2011 [YR2011] 2012 [YR2012] 2013 [YR2013] \\\n", + "0 569.940728793286 622.379654358451 690.842629014956 653.347488111011 \n", + "1 4094.35883191918 4437.8119990258 4247.83985201907 4412.34557813421 \n", + "2 4473.48644568115 5447.40397556569 5583.61615950131 5491.6144135648 \n", + "3 .. .. .. .. \n", + "4 39639.386021211 41630.0525792977 39666.3692147448 42806.5224483021 \n", + "\n", + " 2014 [YR2014] 2015 [YR2015] 2016 [YR2016] \n", + "0 633.947864294639 590.269515382605 .. \n", + "1 4588.64944014811 3965.01680558488 .. \n", + "2 5484.0668056148 4206.03123244958 .. \n", + "3 .. .. .. \n", + "4 .. .. .. 
" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "country_gdp_raw.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
official_name_enISO3166-1-Alpha-2ISO3166-1-Alpha-3ISO4217-currency_alphabetic_code
0Channel IslandsNaNNaNNaN
1SarkNaNNaNNaN
2AfghanistanAFAFGAFN
3AlbaniaALALBALL
4AlgeriaDZDZADZD
5American SamoaASASMUSD
6AndorraADANDEUR
7AngolaAOAGOAOA
8AnguillaAIAIAXCD
9NaNAQATANaN
\n", + "
" + ], + "text/plain": [ + " official_name_en ISO3166-1-Alpha-2 ISO3166-1-Alpha-3 \\\n", + "0 Channel Islands NaN NaN \n", + "1 Sark NaN NaN \n", + "2 Afghanistan AF AFG \n", + "3 Albania AL ALB \n", + "4 Algeria DZ DZA \n", + "5 American Samoa AS ASM \n", + "6 Andorra AD AND \n", + "7 Angola AO AGO \n", + "8 Anguilla AI AIA \n", + "9 NaN AQ ATA \n", + "\n", + " ISO4217-currency_alphabetic_code \n", + "0 NaN \n", + "1 NaN \n", + "2 AFN \n", + "3 ALL \n", + "4 DZD \n", + "5 USD \n", + "6 EUR \n", + "7 AOA \n", + "8 XCD \n", + "9 NaN " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Clean country codes data\n", + "country_codes = country_codes_raw[['official_name_en', 'ISO3166-1-Alpha-2', 'ISO3166-1-Alpha-3', 'ISO4217-currency_alphabetic_code']]\n", + "country_codes.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Clean gdp data\n", + "country_gdp = country_gdp_raw.drop(country_gdp_raw.columns[[0, 1]], axis=1)\n", + "country_gdp.columns = ['name', 'country_code_3', '2002', '2003', '2004', '2005', '2006',\n", + " '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namecountry_code_3200220032004200520062007200820092010201120122013201420152016
0AfghanistanAFG192.153528278789203.651040923182224.914712193371257.175794656273280.245644106914380.400955186598384.131681276838458.955781585831569.940728793286622.379654358451690.842629014956653.347488111011633.947864294639590.269515382605..
1AlbaniaALB1453.642776608531890.681557435442416.58823507012709.142930562013005.01290337563603.013685366384370.539646531484114.136544909454094.358831919184437.81199902584247.839852019074412.345578134214588.649440148113965.01680558488..
2AlgeriaDZA1774.292020799112094.893302132982600.006519725593102.037384226733467.544740085963939.559939398084912.251940819953875.822095424264473.486445681155447.403975565695583.616159501315491.61441356485484.06680561484206.03123244958..
3American SamoaASM..............................
4AndorraADO24175.372754252231742.992584753737235.450032314639990.330408567942417.229145698447253.529796311146735.999574501642701.44713625539639.38602121141630.052579297739666.369214744842806.5224483021......
\n", + "
" + ], + "text/plain": [ + " name country_code_3 2002 2003 \\\n", + "0 Afghanistan AFG 192.153528278789 203.651040923182 \n", + "1 Albania ALB 1453.64277660853 1890.68155743544 \n", + "2 Algeria DZA 1774.29202079911 2094.89330213298 \n", + "3 American Samoa ASM .. .. \n", + "4 Andorra ADO 24175.3727542522 31742.9925847537 \n", + "\n", + " 2004 2005 2006 2007 \\\n", + "0 224.914712193371 257.175794656273 280.245644106914 380.400955186598 \n", + "1 2416.5882350701 2709.14293056201 3005.0129033756 3603.01368536638 \n", + "2 2600.00651972559 3102.03738422673 3467.54474008596 3939.55993939808 \n", + "3 .. .. .. .. \n", + "4 37235.4500323146 39990.3304085679 42417.2291456984 47253.5297963111 \n", + "\n", + " 2008 2009 2010 2011 \\\n", + "0 384.131681276838 458.955781585831 569.940728793286 622.379654358451 \n", + "1 4370.53964653148 4114.13654490945 4094.35883191918 4437.8119990258 \n", + "2 4912.25194081995 3875.82209542426 4473.48644568115 5447.40397556569 \n", + "3 .. .. .. .. \n", + "4 46735.9995745016 42701.447136255 39639.386021211 41630.0525792977 \n", + "\n", + " 2012 2013 2014 2015 2016 \n", + "0 690.842629014956 653.347488111011 633.947864294639 590.269515382605 .. \n", + "1 4247.83985201907 4412.34557813421 4588.64944014811 3965.01680558488 .. \n", + "2 5583.61615950131 5491.6144135648 5484.0668056148 4206.03123244958 .. \n", + "3 .. .. .. .. .. \n", + "4 39666.3692147448 42806.5224483021 .. .. .. " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "country_gdp.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namecountry_codecurrency_code200220032004200520062007200820092010201120122013201420152016
0AfghanistanAFAFN192.153528278789203.651040923182224.914712193371257.175794656273280.245644106914380.400955186598384.131681276838458.955781585831569.940728793286622.379654358451690.842629014956653.347488111011633.947864294639590.269515382605NaN
1AlbaniaALALL1453.642776608531890.681557435442416.58823507012709.142930562013005.01290337563603.013685366384370.539646531484114.136544909454094.358831919184437.81199902584247.839852019074412.345578134214588.649440148113965.01680558488NaN
2AlgeriaDZDZD1774.292020799112094.893302132982600.006519725593102.037384226733467.544740085963939.559939398084912.251940819953875.822095424264473.486445681155447.403975565695583.616159501315491.61441356485484.06680561484206.03123244958NaN
3American SamoaASUSDNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4AndorraNaNNaN24175.372754252231742.992584753737235.450032314639990.330408567942417.229145698447253.529796311146735.999574501642701.44713625539639.38602121141630.052579297739666.369214744842806.5224483021NaNNaNNaN
5AngolaAOAOA775.765518459716850.0756941296971135.604563550851576.162800666892253.838850038653151.022431052894242.363062340923678.947654471793886.479354325244744.987629495895086.84842580865327.148892192325232.690500541324102.11858969272NaN
6Antigua and BarbudaAGXCD10027.856220583910382.630900454510993.369247266812079.865695041613599.90885730415276.068264669815786.172839506213979.262692588313017.310387548712817.841573309413525.61622013413342.084997705413432.079207920814128.8785463675NaN
7ArgentinaARARS2579.188198988843330.428660068474251.56534660585096.258332475055904.678132283427226.270294527458992.589619196748198.5661113628610332.032366296612800.201856329513040.30639522813027.204822139212324.938785772813431.8783398577NaN
8ArmeniaAMAMD779.829626131555924.4640098177611181.96844542241625.407769126822126.618661071843080.970959435723919.975473810612915.58390590873124.784017861953417.171835999153565.51757492543716.828922544643873.533565806823499.80421759862NaN
9ArubaAWAWG20433.654108816720834.939710123722566.682157632423302.831988005524015.420612270125921.538234140627549.889422497624640.421244121824289.141516132625353.7875446441NaNNaNNaNNaNNaN
10AustraliaAUAUD20059.452072137223440.005322185630440.854878622933982.950427159136084.858977747540957.830439248749628.115127017942715.132261962951845.654860556262216.547129413367646.103852962667652.683214618961995.829697656327.7214484289NaN
11AustriaATEUR26351.375676954232102.930552355836693.402620518238242.042517469940430.99361449946586.650252923551386.376651197547654.187209080846659.840818134451123.561329166348324.254036760650557.803805987451148.358876079643438.8630381343NaN
12AzerbaijanAZAZN763.101212220367883.6140089679611045.026415276421578.367330306672473.085776368373851.437868711725574.603802186134950.294791423755842.805783585767189.691229207657393.771876976237811.62141845827886.459143672745496.34464026248NaN
13Bahamas, TheBSBSD22505.404793479322008.433700598921995.519921621923405.879547932723721.156280058724306.142424304423657.36530622222043.012571170921920.516586758321514.898653162822112.608354726822315.603653388622217.494139207522896.9236868716NaN
14BahrainBHBHD13279.045795139714541.616048514116275.180423643318418.072674376819669.319554563121167.618706213323043.025317401719166.708238235820386.017555111622238.711950205923063.132287042624378.944828418824855.215635008623395.7476902684NaN
15BangladeshBDBDT401.708153328187434.046563233432462.274879844841485.852888073473495.853780201055543.082263122362618.075883559283683.614422272329760.331935200195838.547801702232858.933362587621954.3963997156241086.800086769351211.70153057661NaN
16BarbadosBBBBD11674.936182909212028.898740546812869.336594761514223.777703977515646.81500872616461.829269175916569.604961866516526.254551722715901.432935335515530.894291558915317.139001349515153.821375348215366.292610628815660.6794152314NaN
17BelarusBYBYR1479.465321132191819.47902771632378.374893911963126.367777794623848.586203008914735.956889188736376.173114912845176.044701382035818.85485921586305.773662476536721.834907739687722.123350604368025.304355524685740.45649479562NaN
18BelgiumBEEUR25052.330282330743.956925428335589.712945700136967.282920429738852.361033993944403.831306110248424.589273027344880.560151788744382.879768349247699.807051896144734.452346580346622.467987393147299.860108561240231.2831740081NaN
19BelizeBZBZD3556.616082257193679.792815421433831.609830305883933.23437317984187.232889998664324.831017114444470.220795976034258.842876483244344.136669060884516.233030471494673.555197842644723.594320628254884.36862038184906.9406908683NaN
20BeninBJXOF411.959075281789509.461088697817570.685501333771587.080212422838609.018608681583685.551675107742794.870176064085768.013476447657732.953622726674799.03555913143807.688451017232882.638633279988903.464924007693779.067867757338NaN
21BermudaBMBMD62583.100203458866111.725227003670359.319108879875882.033856033983912.697797684690849.586980643693605.748165397688463.312816137588207.327559732285973.15841645585458.45550788685748.0654143782NaNNaNNaN
22BhutanBTINR897.4453331640141009.006231846881107.920595640681257.548640086611346.085774800461755.161709042871810.576177044011786.810672005882201.293077789362485.787052168492452.151587937412383.044729902642560.522131741312532.45446832741NaN
23BoliviaBOBOB913.575853422027917.36431048617978.3347575767461046.42784283071233.592973729961389.63175026071736.924014151511776.857189623661981.160105257012377.700689728882645.290274331742948.032917630733124.080762123533095.3596925636NaN
24Bosnia and HerzegovinaBABAM1746.486383467232192.647078243672619.753257537272928.263590352823351.963399861524107.996231972164974.662266661374586.206313159474475.088130045654860.781811342444494.640640380474748.040692264214851.660527869264197.80730449044NaN
25BotswanaBWBWP3044.127353389634149.041377488344879.459461547275327.85351767865342.14033622535666.637846925685561.898239985985115.119246672026244.002573872727504.850907448986885.829681069456806.677466730717153.444325352126360.64477568565NaN
26BrazilBRBRL2805.717257230543040.506495588193596.224579760564730.653468381585808.340547159797246.870104790678706.819141903748474.8810657660711121.420954149613039.121649958212157.308217647312071.777987135211728.79938751088538.5899749574NaN
27British Virgin IslandsVGUSDNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
28Brunei DarussalamBNBND17016.943493305518758.980894691522131.946064110626337.918112661931157.688447852532707.704307360237798.393503400727726.48105369531453.224396535441787.021402897841807.653341057539151.234229039140979.641943337836607.9278817468NaN
29BulgariaBGBGN2079.22897491962693.758760411073353.564148543743852.977756472684455.690224185785932.899676593157296.122478585856955.987732534216752.552177748827750.039880030727333.355073039457656.638505549227851.265428042346819.86910816324NaN
.........................................................
187SwedenSESEK29571.704464328336961.425367142442442.220447313343085.35314595746256.471601049553324.379372475655746.842381117546207.059203296252076.430524588559593.684798238957134.077068240460283.2452226758899.979794484550272.9415019928NaN
188SwitzerlandCHCHF41336.721917030347960.564972157753255.976308463354797.546634574157348.927882397563223.46777515172119.560873031169672.004714733774277.120512556788002.609570380583208.6865423584669.292936799685610.842028526180214.7301520483NaN
189Syrian Arab RepublicSYSYP1269.729208256851261.426090071361419.593379226651591.530192412271779.821049261542079.98786380715NaNNaNNaNNaNNaNNaNNaNNaNNaN
190TajikistanTJTJS190.604911236529237.89319314008311.424112807574339.768076770367407.252611650134523.947651220881711.503629870563671.544008840534744.189892165383841.219870374428962.4391248535261048.666905996891113.36635446715925.91188767081NaN
191TanzaniaTZTZS310.207276508511325.550773719717348.052371292798446.157893998536475.908765113372533.172396689392657.728743357642665.344002868127708.521932325363740.383900829825827.52888076909909.330141239446954.618987954616864.857502346707NaN
192ThailandTHTHB2093.979206419372349.384525665412643.478934875352874.386274177593351.117632864013962.75049885444384.782675157454231.140367781265111.909202229175539.494368103965915.221146686866225.052283153875969.940115837915816.44068998475NaN
193Timor-LesteNaNNaN496.17975117256487.394976066002484.088448984765501.429226481591464.835169248192551.720598424389673.374537904780.261110851448875.8365692712651015.715927996631127.108214573551117.731251308191131.230850718241134.42642789183NaN
194TogoTGXOF286.758081572586316.792068666893356.830803515721379.18092890047384.288555620343428.401561823965522.625007010443508.54052572385496.482494202806572.025703222528573.207476512184589.014053731293629.9976439467547.974683363941NaN
195TongaTOTOP1836.290404392512089.217042692562357.146474449592565.385798903572837.075471567412932.962758157123307.282393210813106.902375577723557.737183665044226.692746532944364.309244222144117.310982920964114.05211511279NaNNaN
196Trinidad and TobagoTTTTD7049.610061458368804.518665963910290.519143638412323.136559573214095.995815281116530.180442014621188.11856587714508.813736579915840.442691415918287.390785135418322.32380098620217.032282094321317.44926556820444.0785895112NaN
197TunisiaTNTND2373.836477575762790.004368232873139.537201631563217.968646282653394.428979868893805.152937301744342.823177303084162.50966896834176.591590130244291.876043156834179.464343674714248.891275586154328.904197348813872.51208364171NaN
198TurkeyTRTRY3570.546263412954586.811203158655855.538659839057117.233241321917727.272404537279309.5094778217510382.31816089888623.9496271866610111.517704958910538.435120367110539.370337114610800.357976305310303.89880025019130.02606479616NaN
199TurkmenistanTMTMT969.9700334529851286.014261249641455.936049507561706.956726392452140.455472167152606.741975880033918.934926787184059.961511020834479.012354184575724.537102311126797.721166028347480.321677736928193.720296904456947.84002260404NaN
200Turks and Caicos IslandsTCUSDNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
201TuvaluTVAUD1621.333582660221901.033476280012232.602517483722252.892885632172353.356087729462768.370957320573094.628091723022763.160305378483238.375762739053993.500254302344044.193780224873880.190878611233826.90331140252NaNNaN
202UgandaUGUGX243.674624856652241.694801118865292.843015035141321.435761373761342.837263983568409.87032601315459.109775900068557.523595605241608.813023856722591.438623816353656.398072749334674.341641819883714.567356471156675.573466220693NaN
203UkraineUAUAH879.4750486227771048.522487907631367.352433336291828.717625765982303.018830931843068.608997996963891.037823182072545.48034107352973.996480715623569.757027404773855.42128013183986.282966115623065.164222652662114.95471628444NaN
204United Arab EmiratesAEAED32355.409597242734294.893997600537179.681891964240298.524217856242950.100584130542913.784034225445720.017897979232905.053849405434341.911292148639901.220915551841712.124210913542831.089132268943962.713693201140438.3763627115NaN
205United KingdomGBGBP28301.208332242532575.091962619938305.872685992940047.90596700742534.306261344948428.157452798745195.156927388137166.275965494438292.871131358341020.376964308941294.514800866642294.890115781446278.520212882943734.0001709187NaN
206United StatesUSUSD38166.037840781239677.198348105841921.809761789244307.920584860346437.067117306548061.537661335348401.427340389947001.555349681848374.086793309449781.800656352351433.047090472752660.295104979854398.460009399455836.7926308733NaN
207UruguayUYUYU4088.772541199083622.052284279764117.308853330075220.957396017355877.879607622317009.678159529369062.290535129069415.1535831903211938.275054984914166.557666909615092.472058667616881.3840006216737.97310117515573.9009189374NaN
208UzbekistanUZUZS383.349499748749396.129968981304465.119886944025546.776850185552642.960414644088830.4076942043221023.119641106981181.847359600661377.082140469381544.827772735241719.036196241551877.96451184342052.586781359252132.07244181734NaN
209VanuatuVUVUV1353.92783863981580.510668024661787.93826480141886.388309664992047.088285200652393.33462122012698.021240124322643.475785343132965.752230116733275.023982036733158.42097352733167.055303304653147.96485986836NaNNaN
210Venezuela, RBVEVEF3657.193747686033233.95610632524273.365371570565435.872259948536740.236778586698325.2165825527811224.646702688311534.840600113613581.353377101810754.592879065412771.595036110712265.03113567NaNNaNNaN
211VietnamVNVND477.105876263106530.861849364997606.904378261988699.499778976363796.671573774518919.2092655680781164.612524617121232.369671188341333.583526354811542.670436101491754.54797386411907.564381679722052.319083800892111.13802366815NaN
212Virgin Islands (U.S.)VIUSDNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
213West Bank and GazaNaNNaN1156.217473457691257.698570096891337.565724306731455.187875181151441.461700467421575.563400272891855.457020458821963.201518657192338.719876823032663.537024966112787.169738029172992.200994554272960.778004052452866.80010121789NaN
214Yemen, Rep.YEYER567.821963311624607.915804267557696.054960696925817.082416653512904.6056000552611181.21517607281361.71720852271239.837118866531310.053839295421282.398831872171289.034077636681408.14619260488NaNNaNNaN
215ZambiaZMZMW376.468039539246429.007277894455530.553584072736691.809458586711030.315359725631103.486577118751365.721205225481134.772997829451456.126526180311635.547304352221724.743563874351839.522481121151725.974548640091307.78861109883NaN
216ZimbabweZWZWL499.716414957462448.373192699451451.171588554028443.240135984532414.680115131526397.956872258099327.199083662345594.495968478092674.268695411166768.556409302399850.827696604891905.500321786172931.19818468692890.416087751634NaN
\n", + "

217 rows × 18 columns

\n", + "
" + ], + "text/plain": [ + " name country_code currency_code 2002 \\\n", + "0 Afghanistan AF AFN 192.153528278789 \n", + "1 Albania AL ALL 1453.64277660853 \n", + "2 Algeria DZ DZD 1774.29202079911 \n", + "3 American Samoa AS USD NaN \n", + "4 Andorra NaN NaN 24175.3727542522 \n", + "5 Angola AO AOA 775.765518459716 \n", + "6 Antigua and Barbuda AG XCD 10027.8562205839 \n", + "7 Argentina AR ARS 2579.18819898884 \n", + "8 Armenia AM AMD 779.829626131555 \n", + "9 Aruba AW AWG 20433.6541088167 \n", + "10 Australia AU AUD 20059.4520721372 \n", + "11 Austria AT EUR 26351.3756769542 \n", + "12 Azerbaijan AZ AZN 763.101212220367 \n", + "13 Bahamas, The BS BSD 22505.4047934793 \n", + "14 Bahrain BH BHD 13279.0457951397 \n", + "15 Bangladesh BD BDT 401.708153328187 \n", + "16 Barbados BB BBD 11674.9361829092 \n", + "17 Belarus BY BYR 1479.46532113219 \n", + "18 Belgium BE EUR 25052.3302823 \n", + "19 Belize BZ BZD 3556.61608225719 \n", + "20 Benin BJ XOF 411.959075281789 \n", + "21 Bermuda BM BMD 62583.1002034588 \n", + "22 Bhutan BT INR 897.445333164014 \n", + "23 Bolivia BO BOB 913.575853422027 \n", + "24 Bosnia and Herzegovina BA BAM 1746.48638346723 \n", + "25 Botswana BW BWP 3044.12735338963 \n", + "26 Brazil BR BRL 2805.71725723054 \n", + "27 British Virgin Islands VG USD NaN \n", + "28 Brunei Darussalam BN BND 17016.9434933055 \n", + "29 Bulgaria BG BGN 2079.2289749196 \n", + ".. ... ... ... ... 
\n", + "187 Sweden SE SEK 29571.7044643283 \n", + "188 Switzerland CH CHF 41336.7219170303 \n", + "189 Syrian Arab Republic SY SYP 1269.72920825685 \n", + "190 Tajikistan TJ TJS 190.604911236529 \n", + "191 Tanzania TZ TZS 310.207276508511 \n", + "192 Thailand TH THB 2093.97920641937 \n", + "193 Timor-Leste NaN NaN 496.17975117256 \n", + "194 Togo TG XOF 286.758081572586 \n", + "195 Tonga TO TOP 1836.29040439251 \n", + "196 Trinidad and Tobago TT TTD 7049.61006145836 \n", + "197 Tunisia TN TND 2373.83647757576 \n", + "198 Turkey TR TRY 3570.54626341295 \n", + "199 Turkmenistan TM TMT 969.970033452985 \n", + "200 Turks and Caicos Islands TC USD NaN \n", + "201 Tuvalu TV AUD 1621.33358266022 \n", + "202 Uganda UG UGX 243.674624856652 \n", + "203 Ukraine UA UAH 879.475048622777 \n", + "204 United Arab Emirates AE AED 32355.4095972427 \n", + "205 United Kingdom GB GBP 28301.2083322425 \n", + "206 United States US USD 38166.0378407812 \n", + "207 Uruguay UY UYU 4088.77254119908 \n", + "208 Uzbekistan UZ UZS 383.349499748749 \n", + "209 Vanuatu VU VUV 1353.9278386398 \n", + "210 Venezuela, RB VE VEF 3657.19374768603 \n", + "211 Vietnam VN VND 477.105876263106 \n", + "212 Virgin Islands (U.S.) VI USD NaN \n", + "213 West Bank and Gaza NaN NaN 1156.21747345769 \n", + "214 Yemen, Rep. 
YE YER 567.821963311624 \n", + "215 Zambia ZM ZMW 376.468039539246 \n", + "216 Zimbabwe ZW ZWL 499.716414957462 \n", + "\n", + " 2003 2004 2005 2006 \\\n", + "0 203.651040923182 224.914712193371 257.175794656273 280.245644106914 \n", + "1 1890.68155743544 2416.5882350701 2709.14293056201 3005.0129033756 \n", + "2 2094.89330213298 2600.00651972559 3102.03738422673 3467.54474008596 \n", + "3 NaN NaN NaN NaN \n", + "4 31742.9925847537 37235.4500323146 39990.3304085679 42417.2291456984 \n", + "5 850.075694129697 1135.60456355085 1576.16280066689 2253.83885003865 \n", + "6 10382.6309004545 10993.3692472668 12079.8656950416 13599.908857304 \n", + "7 3330.42866006847 4251.5653466058 5096.25833247505 5904.67813228342 \n", + "8 924.464009817761 1181.9684454224 1625.40776912682 2126.61866107184 \n", + "9 20834.9397101237 22566.6821576324 23302.8319880055 24015.4206122701 \n", + "10 23440.0053221856 30440.8548786229 33982.9504271591 36084.8589777475 \n", + "11 32102.9305523558 36693.4026205182 38242.0425174699 40430.993614499 \n", + "12 883.614008967961 1045.02641527642 1578.36733030667 2473.08577636837 \n", + "13 22008.4337005989 21995.5199216219 23405.8795479327 23721.1562800587 \n", + "14 14541.6160485141 16275.1804236433 18418.0726743768 19669.3195545631 \n", + "15 434.046563233432 462.274879844841 485.852888073473 495.853780201055 \n", + "16 12028.8987405468 12869.3365947615 14223.7777039775 15646.815008726 \n", + "17 1819.4790277163 2378.37489391196 3126.36777779462 3848.58620300891 \n", + "18 30743.9569254283 35589.7129457001 36967.2829204297 38852.3610339939 \n", + "19 3679.79281542143 3831.60983030588 3933.2343731798 4187.23288999866 \n", + "20 509.461088697817 570.685501333771 587.080212422838 609.018608681583 \n", + "21 66111.7252270036 70359.3191088798 75882.0338560339 83912.6977976846 \n", + "22 1009.00623184688 1107.92059564068 1257.54864008661 1346.08577480046 \n", + "23 917.36431048617 978.334757576746 1046.4278428307 1233.59297372996 \n", + "24 
2192.64707824367 2619.75325753727 2928.26359035282 3351.96339986152 \n", + "25 4149.04137748834 4879.45946154727 5327.8535176786 5342.1403362253 \n", + "26 3040.50649558819 3596.22457976056 4730.65346838158 5808.34054715979 \n", + "27 NaN NaN NaN NaN \n", + "28 18758.9808946915 22131.9460641106 26337.9181126619 31157.6884478525 \n", + "29 2693.75876041107 3353.56414854374 3852.97775647268 4455.69022418578 \n", + ".. ... ... ... ... \n", + "187 36961.4253671424 42442.2204473133 43085.353145957 46256.4716010495 \n", + "188 47960.5649721577 53255.9763084633 54797.5466345741 57348.9278823975 \n", + "189 1261.42609007136 1419.59337922665 1591.53019241227 1779.82104926154 \n", + "190 237.89319314008 311.424112807574 339.768076770367 407.252611650134 \n", + "191 325.550773719717 348.052371292798 446.157893998536 475.908765113372 \n", + "192 2349.38452566541 2643.47893487535 2874.38627417759 3351.11763286401 \n", + "193 487.394976066002 484.088448984765 501.429226481591 464.835169248192 \n", + "194 316.792068666893 356.830803515721 379.18092890047 384.288555620343 \n", + "195 2089.21704269256 2357.14647444959 2565.38579890357 2837.07547156741 \n", + "196 8804.5186659639 10290.5191436384 12323.1365595732 14095.9958152811 \n", + "197 2790.00436823287 3139.53720163156 3217.96864628265 3394.42897986889 \n", + "198 4586.81120315865 5855.53865983905 7117.23324132191 7727.27240453727 \n", + "199 1286.01426124964 1455.93604950756 1706.95672639245 2140.45547216715 \n", + "200 NaN NaN NaN NaN \n", + "201 1901.03347628001 2232.60251748372 2252.89288563217 2353.35608772946 \n", + "202 241.694801118865 292.843015035141 321.435761373761 342.837263983568 \n", + "203 1048.52248790763 1367.35243333629 1828.71762576598 2303.01883093184 \n", + "204 34294.8939976005 37179.6818919642 40298.5242178562 42950.1005841305 \n", + "205 32575.0919626199 38305.8726859929 40047.905967007 42534.3062613449 \n", + "206 39677.1983481058 41921.8097617892 44307.9205848603 46437.0671173065 \n", + "207 
3622.05228427976 4117.30885333007 5220.95739601735 5877.87960762231 \n", + "208 396.129968981304 465.119886944025 546.776850185552 642.960414644088 \n", + "209 1580.51066802466 1787.9382648014 1886.38830966499 2047.08828520065 \n", + "210 3233.9561063252 4273.36537157056 5435.87225994853 6740.23677858669 \n", + "211 530.861849364997 606.904378261988 699.499778976363 796.671573774518 \n", + "212 NaN NaN NaN NaN \n", + "213 1257.69857009689 1337.56572430673 1455.18787518115 1441.46170046742 \n", + "214 607.915804267557 696.054960696925 817.082416653512 904.605600055261 \n", + "215 429.007277894455 530.553584072736 691.80945858671 1030.31535972563 \n", + "216 448.373192699451 451.171588554028 443.240135984532 414.680115131526 \n", + "\n", + " 2007 2008 2009 2010 \\\n", + "0 380.400955186598 384.131681276838 458.955781585831 569.940728793286 \n", + "1 3603.01368536638 4370.53964653148 4114.13654490945 4094.35883191918 \n", + "2 3939.55993939808 4912.25194081995 3875.82209542426 4473.48644568115 \n", + "3 NaN NaN NaN NaN \n", + "4 47253.5297963111 46735.9995745016 42701.447136255 39639.386021211 \n", + "5 3151.02243105289 4242.36306234092 3678.94765447179 3886.47935432524 \n", + "6 15276.0682646698 15786.1728395062 13979.2626925883 13017.3103875487 \n", + "7 7226.27029452745 8992.58961919674 8198.56611136286 10332.0323662966 \n", + "8 3080.97095943572 3919.97547381061 2915.5839059087 3124.78401786195 \n", + "9 25921.5382341406 27549.8894224976 24640.4212441218 24289.1415161326 \n", + "10 40957.8304392487 49628.1151270179 42715.1322619629 51845.6548605562 \n", + "11 46586.6502529235 51386.3766511975 47654.1872090808 46659.8408181344 \n", + "12 3851.43786871172 5574.60380218613 4950.29479142375 5842.80578358576 \n", + "13 24306.1424243044 23657.365306222 22043.0125711709 21920.5165867583 \n", + "14 21167.6187062133 23043.0253174017 19166.7082382358 20386.0175551116 \n", + "15 543.082263122362 618.075883559283 683.614422272329 760.331935200195 \n", + "16 16461.8292691759 
16569.6049618665 16526.2545517227 15901.4329353355 \n", + "17 4735.95688918873 6376.17311491284 5176.04470138203 5818.8548592158 \n", + "18 44403.8313061102 48424.5892730273 44880.5601517887 44382.8797683492 \n", + "19 4324.83101711444 4470.22079597603 4258.84287648324 4344.13666906088 \n", + "20 685.551675107742 794.870176064085 768.013476447657 732.953622726674 \n", + "21 90849.5869806436 93605.7481653976 88463.3128161375 88207.3275597322 \n", + "22 1755.16170904287 1810.57617704401 1786.81067200588 2201.29307778936 \n", + "23 1389.6317502607 1736.92401415151 1776.85718962366 1981.16010525701 \n", + "24 4107.99623197216 4974.66226666137 4586.20631315947 4475.08813004565 \n", + "25 5666.63784692568 5561.89823998598 5115.11924667202 6244.00257387272 \n", + "26 7246.87010479067 8706.81914190374 8474.88106576607 11121.4209541496 \n", + "27 NaN NaN NaN NaN \n", + "28 32707.7043073602 37798.3935034007 27726.481053695 31453.2243965354 \n", + "29 5932.89967659315 7296.12247858585 6955.98773253421 6752.55217774882 \n", + ".. ... ... ... ... 
\n", + "187 53324.3793724756 55746.8423811175 46207.0592032962 52076.4305245885 \n", + "188 63223.467775151 72119.5608730311 69672.0047147337 74277.1205125567 \n", + "189 2079.98786380715 NaN NaN NaN \n", + "190 523.947651220881 711.503629870563 671.544008840534 744.189892165383 \n", + "191 533.172396689392 657.728743357642 665.344002868127 708.521932325363 \n", + "192 3962.7504988544 4384.78267515745 4231.14036778126 5111.90920222917 \n", + "193 551.720598424389 673.374537904 780.261110851448 875.836569271265 \n", + "194 428.401561823965 522.625007010443 508.54052572385 496.482494202806 \n", + "195 2932.96275815712 3307.28239321081 3106.90237557772 3557.73718366504 \n", + "196 16530.1804420146 21188.118565877 14508.8137365799 15840.4426914159 \n", + "197 3805.15293730174 4342.82317730308 4162.5096689683 4176.59159013024 \n", + "198 9309.50947782175 10382.3181608988 8623.94962718666 10111.5177049589 \n", + "199 2606.74197588003 3918.93492678718 4059.96151102083 4479.01235418457 \n", + "200 NaN NaN NaN NaN \n", + "201 2768.37095732057 3094.62809172302 2763.16030537848 3238.37576273905 \n", + "202 409.87032601315 459.109775900068 557.523595605241 608.813023856722 \n", + "203 3068.60899799696 3891.03782318207 2545.4803410735 2973.99648071562 \n", + "204 42913.7840342254 45720.0178979792 32905.0538494054 34341.9112921486 \n", + "205 48428.1574527987 45195.1569273881 37166.2759654944 38292.8711313583 \n", + "206 48061.5376613353 48401.4273403899 47001.5553496818 48374.0867933094 \n", + "207 7009.67815952936 9062.29053512906 9415.15358319032 11938.2750549849 \n", + "208 830.407694204322 1023.11964110698 1181.84735960066 1377.08214046938 \n", + "209 2393.3346212201 2698.02124012432 2643.47578534313 2965.75223011673 \n", + "210 8325.21658255278 11224.6467026883 11534.8406001136 13581.3533771018 \n", + "211 919.209265568078 1164.61252461712 1232.36967118834 1333.58352635481 \n", + "212 NaN NaN NaN NaN \n", + "213 1575.56340027289 1855.45702045882 1963.20151865719 
2338.71987682303 \n", + "214 1181.2151760728 1361.7172085227 1239.83711886653 1310.05383929542 \n", + "215 1103.48657711875 1365.72120522548 1134.77299782945 1456.12652618031 \n", + "216 397.956872258099 327.199083662345 594.495968478092 674.268695411166 \n", + "\n", + " 2011 2012 2013 2014 \\\n", + "0 622.379654358451 690.842629014956 653.347488111011 633.947864294639 \n", + "1 4437.8119990258 4247.83985201907 4412.34557813421 4588.64944014811 \n", + "2 5447.40397556569 5583.61615950131 5491.6144135648 5484.0668056148 \n", + "3 NaN NaN NaN NaN \n", + "4 41630.0525792977 39666.3692147448 42806.5224483021 NaN \n", + "5 4744.98762949589 5086.8484258086 5327.14889219232 5232.69050054132 \n", + "6 12817.8415733094 13525.616220134 13342.0849977054 13432.0792079208 \n", + "7 12800.2018563295 13040.306395228 13027.2048221392 12324.9387857728 \n", + "8 3417.17183599915 3565.5175749254 3716.82892254464 3873.53356580682 \n", + "9 25353.7875446441 NaN NaN NaN \n", + "10 62216.5471294133 67646.1038529626 67652.6832146189 61995.8296976 \n", + "11 51123.5613291663 48324.2540367606 50557.8038059874 51148.3588760796 \n", + "12 7189.69122920765 7393.77187697623 7811.6214184582 7886.45914367274 \n", + "13 21514.8986531628 22112.6083547268 22315.6036533886 22217.4941392075 \n", + "14 22238.7119502059 23063.1322870426 24378.9448284188 24855.2156350086 \n", + "15 838.547801702232 858.933362587621 954.396399715624 1086.80008676935 \n", + "16 15530.8942915589 15317.1390013495 15153.8213753482 15366.2926106288 \n", + "17 6305.77366247653 6721.83490773968 7722.12335060436 8025.30435552468 \n", + "18 47699.8070518961 44734.4523465803 46622.4679873931 47299.8601085612 \n", + "19 4516.23303047149 4673.55519784264 4723.59432062825 4884.3686203818 \n", + "20 799.03555913143 807.688451017232 882.638633279988 903.464924007693 \n", + "21 85973.158416455 85458.455507886 85748.0654143782 NaN \n", + "22 2485.78705216849 2452.15158793741 2383.04472990264 2560.52213174131 \n", + "23 2377.70068972888 
2645.29027433174 2948.03291763073 3124.08076212353 \n", + "24 4860.78181134244 4494.64064038047 4748.04069226421 4851.66052786926 \n", + "25 7504.85090744898 6885.82968106945 6806.67746673071 7153.44432535212 \n", + "26 13039.1216499582 12157.3082176473 12071.7779871352 11728.7993875108 \n", + "27 NaN NaN NaN NaN \n", + "28 41787.0214028978 41807.6533410575 39151.2342290391 40979.6419433378 \n", + "29 7750.03988003072 7333.35507303945 7656.63850554922 7851.26542804234 \n", + ".. ... ... ... ... \n", + "187 59593.6847982389 57134.0770682404 60283.24522267 58899.9797944845 \n", + "188 88002.6095703805 83208.68654235 84669.2929367996 85610.8420285261 \n", + "189 NaN NaN NaN NaN \n", + "190 841.219870374428 962.439124853526 1048.66690599689 1113.36635446715 \n", + "191 740.383900829825 827.52888076909 909.330141239446 954.618987954616 \n", + "192 5539.49436810396 5915.22114668686 6225.05228315387 5969.94011583791 \n", + "193 1015.71592799663 1127.10821457355 1117.73125130819 1131.23085071824 \n", + "194 572.025703222528 573.207476512184 589.014053731293 629.9976439467 \n", + "195 4226.69274653294 4364.30924422214 4117.31098292096 4114.05211511279 \n", + "196 18287.3907851354 18322.323800986 20217.0322820943 21317.449265568 \n", + "197 4291.87604315683 4179.46434367471 4248.89127558615 4328.90419734881 \n", + "198 10538.4351203671 10539.3703371146 10800.3579763053 10303.8988002501 \n", + "199 5724.53710231112 6797.72116602834 7480.32167773692 8193.72029690445 \n", + "200 NaN NaN NaN NaN \n", + "201 3993.50025430234 4044.19378022487 3880.19087861123 3826.90331140252 \n", + "202 591.438623816353 656.398072749334 674.341641819883 714.567356471156 \n", + "203 3569.75702740477 3855.4212801318 3986.28296611562 3065.16422265266 \n", + "204 39901.2209155518 41712.1242109135 42831.0891322689 43962.7136932011 \n", + "205 41020.3769643089 41294.5148008666 42294.8901157814 46278.5202128829 \n", + "206 49781.8006563523 51433.0470904727 52660.2951049798 54398.4600093994 \n", + "207 
14166.5576669096 15092.4720586676 16881.38400062 16737.973101175 \n", + "208 1544.82777273524 1719.03619624155 1877.9645118434 2052.58678135925 \n", + "209 3275.02398203673 3158.4209735273 3167.05530330465 3147.96485986836 \n", + "210 10754.5928790654 12771.5950361107 12265.03113567 NaN \n", + "211 1542.67043610149 1754.5479738641 1907.56438167972 2052.31908380089 \n", + "212 NaN NaN NaN NaN \n", + "213 2663.53702496611 2787.16973802917 2992.20099455427 2960.77800405245 \n", + "214 1282.39883187217 1289.03407763668 1408.14619260488 NaN \n", + "215 1635.54730435222 1724.74356387435 1839.52248112115 1725.97454864009 \n", + "216 768.556409302399 850.827696604891 905.500321786172 931.19818468692 \n", + "\n", + " 2015 2016 \n", + "0 590.269515382605 NaN \n", + "1 3965.01680558488 NaN \n", + "2 4206.03123244958 NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "5 4102.11858969272 NaN \n", + "6 14128.8785463675 NaN \n", + "7 13431.8783398577 NaN \n", + "8 3499.80421759862 NaN \n", + "9 NaN NaN \n", + "10 56327.7214484289 NaN \n", + "11 43438.8630381343 NaN \n", + "12 5496.34464026248 NaN \n", + "13 22896.9236868716 NaN \n", + "14 23395.7476902684 NaN \n", + "15 1211.70153057661 NaN \n", + "16 15660.6794152314 NaN \n", + "17 5740.45649479562 NaN \n", + "18 40231.2831740081 NaN \n", + "19 4906.9406908683 NaN \n", + "20 779.067867757338 NaN \n", + "21 NaN NaN \n", + "22 2532.45446832741 NaN \n", + "23 3095.3596925636 NaN \n", + "24 4197.80730449044 NaN \n", + "25 6360.64477568565 NaN \n", + "26 8538.5899749574 NaN \n", + "27 NaN NaN \n", + "28 36607.9278817468 NaN \n", + "29 6819.86910816324 NaN \n", + ".. ... ... 
\n", + "187 50272.9415019928 NaN \n", + "188 80214.7301520483 NaN \n", + "189 NaN NaN \n", + "190 925.91188767081 NaN \n", + "191 864.857502346707 NaN \n", + "192 5816.44068998475 NaN \n", + "193 1134.42642789183 NaN \n", + "194 547.974683363941 NaN \n", + "195 NaN NaN \n", + "196 20444.0785895112 NaN \n", + "197 3872.51208364171 NaN \n", + "198 9130.02606479616 NaN \n", + "199 6947.84002260404 NaN \n", + "200 NaN NaN \n", + "201 NaN NaN \n", + "202 675.573466220693 NaN \n", + "203 2114.95471628444 NaN \n", + "204 40438.3763627115 NaN \n", + "205 43734.0001709187 NaN \n", + "206 55836.7926308733 NaN \n", + "207 15573.9009189374 NaN \n", + "208 2132.07244181734 NaN \n", + "209 NaN NaN \n", + "210 NaN NaN \n", + "211 2111.13802366815 NaN \n", + "212 NaN NaN \n", + "213 2866.80010121789 NaN \n", + "214 NaN NaN \n", + "215 1307.78861109883 NaN \n", + "216 890.416087751634 NaN \n", + "\n", + "[217 rows x 18 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Merge gdp and code\n", + "country_gdp = pd.merge(country_gdp, country_codes, left_on='country_code_3', right_on='ISO3166-1-Alpha-3', how='left')\n", + "country_gdp.drop(['official_name_en', 'ISO3166-1-Alpha-3', 'country_code_3'], axis=1, inplace=True)\n", + "country_gdp = country_gdp.rename(columns = {'ISO3166-1-Alpha-2':'country_code',\n", + " 'ISO4217-currency_alphabetic_code':'currency_code'})\n", + "country_gdp.replace('..', np.nan, inplace=True)\n", + "\n", + "# Reorder columns\n", + "cols = list(country_gdp.columns)\n", + "cols.insert(1, cols.pop(cols.index('country_code')))\n", + "cols.insert(2, cols.pop(cols.index('currency_code')))\n", + "country_gdp = country_gdp.reindex(columns= cols)\n", + "\n", + "country_gdp" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def gdp(country_code, disbursal_date):\n", + " def historical_gdp(array):\n", + " array = 
np.array(map(float, array))\n", + " array = array[~np.isnan(array)] # Remove NaN\n", + " if len(array) == 0: # No GDP values\n", + " return 0\n", + " return float(np.mean(array, dtype=np.float64))\n", + " \n", + " # TODO: Unable to resolve country code WorldBank dataset has wrong alpha 3 codes e.g. Andorra causing issues\n", + " try:\n", + " float(country_code)\n", + " return 0\n", + " except:\n", + " if country_code not in list(country_gdp['country_code']):\n", + " return 0 # TODO: Bad solution ? \n", + " \n", + " # Get the historical average GDP if no disbursal date\n", + " all_gdp = country_gdp[country_gdp.country_code == country_code].values[0][3:]\n", + " if (disbursal_date is None): # or (country_gdp[date][country_gdp.country_code == country_code] == float('Nan')):\n", + " return historical_gdp(all_gdp)\n", + " \n", + " date = str(datetime.strptime(disbursal_date, '%Y-%m-%dT%H:%M:%SZ').year)\n", + " # Get the historical average GDP if no GDP for that year\n", + " if pd.isnull(country_gdp[date][country_gdp.country_code == country_code].values[0]):\n", + " return historical_gdp(all_gdp)\n", + " \n", + " return float(country_gdp[date][country_gdp.country_code == country_code].values[0])\n", + "\n", + "sql_ctx.registerFunction('gdp', gdp, pyspark.sql.types.FloatType())\n", + "# gdp('ZA', '2016-12-13T18:22:55Z')\n", + "\n", + "# sql_ctx.sql(\"\"\"\n", + "# SELECT \n", + "# activity, \n", + "# basket_amount,\n", + "# terms.disbursal_date,\n", + "# gdp(location.country_code, terms.disbursal_date) as gdp\n", + "# FROM loans\n", + "# LIMIT 10\n", + "# \"\"\").collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train, validation, test = loans.randomSplit([.6, .2, .2], 101)\n", + "\n", + "\n", + "train.registerTempTable('loans_train')\n", + "validation.registerTempTable('loans_validation')\n", + "test.registerTempTable('loans_test')\n" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "\n", + "# Starts Here - Currency exchange" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Series NameSeries CodeCountry NameCountry Code2002 [YR2002]2003 [YR2003]2004 [YR2004]2005 [YR2005]2006 [YR2006]2007 [YR2007]2008 [YR2008]2009 [YR2009]2010 [YR2010]2011 [YR2011]2012 [YR2012]2013 [YR2013]2014 [YR2014]2015 [YR2015]2016 [YR2016]
213Official exchange rate (LCU per US$, period av...PA.NUS.FCRFWest Bank and GazaWBG..............................
214Official exchange rate (LCU per US$, period av...PA.NUS.FCRFYemen, Rep.YEM175.625183.448333333333184.775833333333191.509166666667197.049166666667198.953333333333199.764166666667202.846666666667219.59213.8214.350833333333214.89214.89214.89..
215Official exchange rate (LCU per US$, period av...PA.NUS.FCRFZambiaZMB4.3985954.733271046498724.778875386435794.463503310515873.603072042582494.002522665036433.745660690087645.046109245212354.7971368754.860665532093495.147252665144135.395887067944466.152816248124498.63235596234196..
216Official exchange rate (LCU per US$, period av...PA.NUS.FCRFZimbabweZWE0.05509829058103380.6982160713057235.0744194146319522.3890396048255164.5473565006469686.771669541756723052073.3381................
217Official exchange rate (LCU per US$, period av...PA.NUS.FCRFEuro areaEMU1.062551666666670.8860341666666670.8053650.804120.7971408333333330.73063750.6826747112398730.7198433597856150.7550449519898350.7193552536091540.7782936014128520.753159181847270.7537307367174020.901658961641278..
\n", + "
" + ], + "text/plain": [ + " Series Name Series Code \\\n", + "213 Official exchange rate (LCU per US$, period av... PA.NUS.FCRF \n", + "214 Official exchange rate (LCU per US$, period av... PA.NUS.FCRF \n", + "215 Official exchange rate (LCU per US$, period av... PA.NUS.FCRF \n", + "216 Official exchange rate (LCU per US$, period av... PA.NUS.FCRF \n", + "217 Official exchange rate (LCU per US$, period av... PA.NUS.FCRF \n", + "\n", + " Country Name Country Code 2002 [YR2002] 2003 [YR2003] \\\n", + "213 West Bank and Gaza WBG .. .. \n", + "214 Yemen, Rep. YEM 175.625 183.448333333333 \n", + "215 Zambia ZMB 4.398595 4.73327104649872 \n", + "216 Zimbabwe ZWE 0.0550982905810338 0.698216071305723 \n", + "217 Euro area EMU 1.06255166666667 0.886034166666667 \n", + "\n", + " 2004 [YR2004] 2005 [YR2005] 2006 [YR2006] 2007 [YR2007] \\\n", + "213 .. .. .. .. \n", + "214 184.775833333333 191.509166666667 197.049166666667 198.953333333333 \n", + "215 4.77887538643579 4.46350331051587 3.60307204258249 4.00252266503643 \n", + "216 5.07441941463195 22.3890396048255 164.547356500646 9686.77166954175 \n", + "217 0.805365 0.80412 0.797140833333333 0.7306375 \n", + "\n", + " 2008 [YR2008] 2009 [YR2009] 2010 [YR2010] \\\n", + "213 .. .. .. \n", + "214 199.764166666667 202.846666666667 219.59 \n", + "215 3.74566069008764 5.04610924521235 4.797136875 \n", + "216 6723052073.3381 .. .. \n", + "217 0.682674711239873 0.719843359785615 0.755044951989835 \n", + "\n", + " 2011 [YR2011] 2012 [YR2012] 2013 [YR2013] \\\n", + "213 .. .. .. \n", + "214 213.8 214.350833333333 214.89 \n", + "215 4.86066553209349 5.14725266514413 5.39588706794446 \n", + "216 .. .. .. \n", + "217 0.719355253609154 0.778293601412852 0.75315918184727 \n", + "\n", + " 2014 [YR2014] 2015 [YR2015] 2016 [YR2016] \n", + "213 .. .. .. \n", + "214 214.89 214.89 .. \n", + "215 6.15281624812449 8.63235596234196 .. \n", + "216 .. .. .. \n", + "217 0.753730736717402 0.901658961641278 .. 
" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "currencies_raw = pd.read_csv(LOCAL_PATH + 'economic-data/currencies.csv')\n", + "currencies_raw.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Cleanup\n", + "currencies = currencies_raw.drop(country_gdp_raw.columns[[0, 1]], axis=1)\n", + "currencies.columns = ['country_name', 'country_code_3', '2002', '2003', '2004', '2005', '2006',\n", + " '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
country_namecountry_codecurrency_code200220032004200520062007200820092010201120122013201420152016
213West Bank and GazaNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
214Yemen, Rep.YEYER175.625183.448333333333184.775833333333191.509166666667197.049166666667198.953333333333199.764166666667202.846666666667219.59213.8214.350833333333214.89214.89214.89NaN
215ZambiaZMZMW4.3985954.733271046498724.778875386435794.463503310515873.603072042582494.002522665036433.745660690087645.046109245212354.7971368754.860665532093495.147252665144135.395887067944466.152816248124498.63235596234196NaN
216ZimbabweZWZWL0.05509829058103380.6982160713057235.0744194146319522.3890396048255164.5473565006469686.771669541756723052073.3381NaNNaNNaNNaNNaNNaNNaNNaN
217Euro areaEUEMU1.062551666666670.8860341666666670.8053650.804120.7971408333333330.73063750.6826747112398730.7198433597856150.7550449519898350.7193552536091540.7782936014128520.753159181847270.7537307367174020.901658961641278NaN
\n", + "
" + ], + "text/plain": [ + " country_name country_code currency_code 2002 \\\n", + "213 West Bank and Gaza NaN NaN NaN \n", + "214 Yemen, Rep. YE YER 175.625 \n", + "215 Zambia ZM ZMW 4.398595 \n", + "216 Zimbabwe ZW ZWL 0.0550982905810338 \n", + "217 Euro area EU EMU 1.06255166666667 \n", + "\n", + " 2003 2004 2005 2006 \\\n", + "213 NaN NaN NaN NaN \n", + "214 183.448333333333 184.775833333333 191.509166666667 197.049166666667 \n", + "215 4.73327104649872 4.77887538643579 4.46350331051587 3.60307204258249 \n", + "216 0.698216071305723 5.07441941463195 22.3890396048255 164.547356500646 \n", + "217 0.886034166666667 0.805365 0.80412 0.797140833333333 \n", + "\n", + " 2007 2008 2009 \\\n", + "213 NaN NaN NaN \n", + "214 198.953333333333 199.764166666667 202.846666666667 \n", + "215 4.00252266503643 3.74566069008764 5.04610924521235 \n", + "216 9686.77166954175 6723052073.3381 NaN \n", + "217 0.7306375 0.682674711239873 0.719843359785615 \n", + "\n", + " 2010 2011 2012 \\\n", + "213 NaN NaN NaN \n", + "214 219.59 213.8 214.350833333333 \n", + "215 4.797136875 4.86066553209349 5.14725266514413 \n", + "216 NaN NaN NaN \n", + "217 0.755044951989835 0.719355253609154 0.778293601412852 \n", + "\n", + " 2013 2014 2015 2016 \n", + "213 NaN NaN NaN NaN \n", + "214 214.89 214.89 214.89 NaN \n", + "215 5.39588706794446 6.15281624812449 8.63235596234196 NaN \n", + "216 NaN NaN NaN NaN \n", + "217 0.75315918184727 0.753730736717402 0.901658961641278 NaN " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get ISO 2 code\n", + "currencies = pd.merge(currencies, country_codes, left_on='country_code_3', right_on='ISO3166-1-Alpha-3', how='left')\n", + "currencies.drop(['official_name_en', 'ISO3166-1-Alpha-3', 'country_code_3'], axis=1, inplace=True)\n", + "currencies = currencies.rename(columns = {'ISO3166-1-Alpha-2':'country_code',\n", + " 'ISO4217-currency_alphabetic_code':'currency_code'})\n", + 
"currencies.replace('..', np.nan, inplace=True)\n", + "\n", + "# Add code for European Union\n", + "currencies.set_value(217, 'country_code', 'EU')\n", + "currencies.set_value(217, 'currency_code', 'EMU')\n", + "\n", + "# Reorder columns\n", + "cols = list(currencies.columns)\n", + "cols.insert(1, cols.pop(cols.index('country_code')))\n", + "cols.insert(2, cols.pop(cols.index('currency_code')))\n", + "currencies = currencies.reindex(columns=cols)\n", + "\n", + "currencies.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def xchange_rate(country_code, disbursal_date):\n", + " def historical_rates(array):\n", + " array = np.array(map(float, array))\n", + " array = array[~np.isnan(array)] # Remove NaN\n", + " if len(array) == 0: # No rate values\n", + " return 1\n", + " return float(np.mean(array, dtype=np.float64))\n", + " \n", + " eu = ['AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR','HU','IE',\n", + " 'IT','LV','LT','LU','MT','NL','PL','PT','RO','SK','SI','ES','SE','GB']\n", + " us = ['AS','GU','MP','PR','UM','VI']\n", + " try:\n", + " float(country_code) # Country code unknown?\n", + " if pd.isnull(country_code):\n", + " return 1 # TODO: Bad solution ??\n", + " except:\n", + " if country_code in eu:\n", + " country_code = 'EU'\n", + " elif country_code in us:\n", + " country_code = 'US'\n", + " if country_code not in list(currencies['country_code']):\n", + " return 1\n", + " \n", + " \n", + " # TODO: Unable to resolve country code WorldBank dataset has wrong alpha 3 codes e.g. 
Andorra causing\n", + " try:\n", + " float(country_code)\n", + " return 0\n", + " except:\n", + " if country_code not in list(currencies['country_code']):\n", + " return 0 # TODO: Bad solution \n", + " \n", + " # Get the historical average exchange rate if no disbursal date\n", + " all_rates = currencies[currencies.country_code == country_code].values[0][3:]\n", + " if (disbursal_date is None): # or (country_gdp[date][country_gdp.country_code == country_code] == float('Nan')):\n", + " return historical_rates(all_rates)\n", + " \n", + " date = str(datetime.strptime(disbursal_date, '%Y-%m-%dT%H:%M:%SZ').year)\n", + " # Get the historical average exchange rate if no GDP for that year\n", + " if pd.isnull(currencies[date][currencies.country_code == country_code].values[0]):\n", + " return historical_rates(all_rates)\n", + " \n", + " return float(currencies[date][currencies.country_code == country_code].values[0])\n", + "\n", + "sql_ctx.registerFunction('xchange_rate', xchange_rate, pyspark.sql.types.FloatType())\n", + "# xchange_rate('BE', '2016-12-13T18:22:55Z') \n", + "\n", + "# sql_ctx.sql(\"\"\"\n", + "# SELECT \n", + "# xchange_rate(location.country_code, terms.disbursal_date) as xchange_rate\n", + "# FROM loans\n", + "# LIMIT 20\n", + "# \"\"\").collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def extract_tags(tags):\n", + " value = []\n", + " if len(tags) != 0:\n", + " value = [item for (item,) in tags]\n", + " return ','.join(value)\n", + "\n", + "sql_ctx.registerFunction('extract_tags', extract_tags, pyspark.sql.types.StringType())\n", + "\n", + " \n", + "# # unpack_tags([('volunteer_pick',), ('volunteer_like',)])\n", + "\n", + "# sql_ctx.registerFunction('unpack_tags', unpack_tags)\n", + "# sql_ctx.sql(\n", + "# \"\"\"\n", + "# SELECT \n", + "# unpack_tags(tags) as tags\n", + "# FROM loans\n", + "# LIMIT 20000\n", + "# \"\"\"\n", + "# ).collect()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "\n", + "query = '''\n", + "SELECT\n", + " id,\n", + " activity,\n", + " size(borrowers) as num_borrowers,\n", + " male_proportion(borrowers) as male_proportion,\n", + " lender_count,\n", + " location.country,\n", + " location.country_code,\n", + " partner_id,\n", + " sector,\n", + " extract_tags(tags) as tags,\n", + " DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n", + " terms.disbursal_amount,\n", + " terms.disbursal_currency,\n", + " terms.disbursal_date,\n", + " size(terms.scheduled_payments) as num_repayments,\n", + " terms.repayment_interval,\n", + " CASE WHEN\n", + " (status = 'defaulted') OR\n", + " (status = 'deleted') OR\n", + " (status = 'issue') OR\n", + " (status = 'inactive_expired') OR\n", + " (status = 'expired') OR\n", + " (status = 'inactive') OR\n", + " (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n", + " gdp(location.country_code, terms.disbursal_date) as gdp,\n", + " xchange_rate(location.country_code, terms.disbursal_date) as xchange_rate,\n", + " status,\n", + " delinquent\n", + " \n", + "FROM {}\n", + "WHERE\n", + " status != 'fundraising' AND\n", + " status != 'funded'\n", + "'''# Removed CASE WHEN (status = 'refunded') as it appears not to be bad loan indicator\n", + "\n", + "train_filtered = sql_ctx.sql(query.format('loans_train'))\n", + "# sparkSql.sql(query.format('loans_validation')).write.json('validation_data-filtered.json')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "loans_pd = sql_ctx.sql(query2.format('loans')).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idactivitynum_borrowersmale_proportionlender_countcountrycountry_codepartner_idsectortags...disbursal_amountdisbursal_currencydisbursal_datenum_repaymentsrepayment_intervalbad_loangdpxchange_ratestatusdelinquent
01224Clothing Sales10.00HondurasHN5Clothing...250.0NoneNone0None11884.94116218.520081deletedNone
15320Food Production/Sales10.07GhanaGH19Food...500.0USD2007-03-15T04:52:22Z9Monthly01099.0223390.935248paidNone
29416Food Production/Sales10.03VietnamVN41Food...75.0USD2007-05-17T22:02:42Z6Monthly0919.20929016105.125000paidNone
313512Clothing Sales10.010NigeriaNG20Clothing...450.0USD2007-07-20T16:35:52Z8Monthly01131.147705125.808105paidNone
417608Livestock10.036Cote D'IvoireCI53Agriculture...950.0USD2007-09-20T02:55:18Z12Monthly01078.541504479.266785paidNone
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " id activity num_borrowers male_proportion lender_count \\\n", + "0 1224 Clothing Sales 1 0.0 0 \n", + "1 5320 Food Production/Sales 1 0.0 7 \n", + "2 9416 Food Production/Sales 1 0.0 3 \n", + "3 13512 Clothing Sales 1 0.0 10 \n", + "4 17608 Livestock 1 0.0 36 \n", + "\n", + " country country_code partner_id sector tags ... \\\n", + "0 Honduras HN 5 Clothing ... \n", + "1 Ghana GH 19 Food ... \n", + "2 Vietnam VN 41 Food ... \n", + "3 Nigeria NG 20 Clothing ... \n", + "4 Cote D'Ivoire CI 53 Agriculture ... \n", + "\n", + " disbursal_amount disbursal_currency disbursal_date num_repayments \\\n", + "0 250.0 None None 0 \n", + "1 500.0 USD 2007-03-15T04:52:22Z 9 \n", + "2 75.0 USD 2007-05-17T22:02:42Z 6 \n", + "3 450.0 USD 2007-07-20T16:35:52Z 8 \n", + "4 950.0 USD 2007-09-20T02:55:18Z 12 \n", + "\n", + " repayment_interval bad_loan gdp xchange_rate status delinquent \n", + "0 None 1 1884.941162 18.520081 deleted None \n", + "1 Monthly 0 1099.022339 0.935248 paid None \n", + "2 Monthly 0 919.209290 16105.125000 paid None \n", + "3 Monthly 0 1131.147705 125.808105 paid None \n", + "4 Monthly 0 1078.541504 479.266785 paid None \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans_pd.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "loans_pd.fillna(0, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "loans_pd = loans_pd.drop(['status', 'delinquent'], axis=1)\n", + "loans_dummies = pd.get_dummies(loans_pd)\n", + "\n", + "train, test = train_test_split(loans_dummies, test_size=0.30, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": { + 
"collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "11 0\n", + "47 0\n", + "85 0\n", + "28 1\n", + "93 1\n", + "Name: bad_loan, dtype: int64" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_target = pd.Series(train['bad_loan'])\n", + "train_ids = pd.Series(test['id'])\n", + "\n", + "test_target = pd.Series(test['bad_loan'])\n", + "test_ids = pd.Series(test['id'])\n", + "\n", + "train = train.drop(['id', 'bad_loan'], axis=1)\n", + "test = test.drop(['id', 'bad_loan'], axis=1)\n", + "\n", + "train_target.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train_data = train.as_matrix()\n", + "test_data = test.as_matrix()" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn import svm\n", + "\n", + "clf = svm.SVC(gamma=0.001, C=100.)" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.93333333333333335" + ] + }, + "execution_count": 173, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(train_data, train_target)\n", + "\n", + "predicted = clf.predict(test_data)\n", + "np.mean(predicted == test_target)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 
0 +}