In [1]:
from pyspark.sql import SQLContext

LOCAL_PATH = 'file:///Users/Karl-Loic/Documents/Columbia-University/Fall-2016/Big-Data-Analytics/final-project/kiva-dig/'

In [2]:
sql_ctx = SQLContext(sc)

In [3]:
# TODO: Create permanent database tables for reuse
# Remove previously created metastore DB
import os, shutil

if os.path.exists(LOCAL_PATH + 'metastore_db/'):
    shutil.rmtree('metastore_db/')
loans = sql_ctx.read.json(LOCAL_PATH + 'kiva-data/loans.json')

In [4]:
loans.count()

1014130

In [5]:
loans.printSchema()

root
 |-- activity: string (nullable = true)
 |-- basket_amount: long (nullable = true)
 |-- bonus_credit_eligibility: boolean (nullable = true)
 |-- borrowers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- gender: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)
 |    |    |-- pictured: boolean (nullable = true)
 |-- currency_exchange_loss_amount: double (nullable = true)
 |-- delinquent: boolean (nullable = true)
 |-- description: struct (nullable = true)
 |    |-- languages: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- texts: struct (nullable = true)
 |    |    |-- ar: string (nullable = true)
 |    |    |-- en: string (nullable = true)
 |    |    |-- es: string (nullable = true)
 |    |    |-- fr: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- mn: string (nullable = true)
 |    |    |--

In [13]:
loans.registerTempTable("loans")

def male_proportion(borrowers):
    num_males = 0
    for item in borrowers:
        if item.gender == 'M':
            num_males += 1
            
    return num_males/ float(len(borrowers))

sql_ctx.registerFunction('male_proportion', male_proportion)

bad_loans = sql_ctx.sql("""
SELECT 
    activity, 
    basket_amount,
    size(borrowers) as num_borrowers,
    male_proportion(borrowers),
    terms.disbursal_date,
    
    use,
    tags,
    status,
    delinquent
FROM loans
LIMIT 10
""")

bad_loans.collect()

[Row(activity=u'Clothing Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=None, use=u'Compra de ropa dama, caballero y ni\xf1o.', tags=[], status=u'deleted', delinquent=None),
 Row(activity=u'Food Production/Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-03-15T04:52:22Z', use=u'Working capital', tags=[], status=u'paid', delinquent=None),
 Row(activity=u'Food Production/Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-05-17T22:02:42Z', use=u'Sell bread at the local market', tags=[], status=u'paid', delinquent=None),
 Row(activity=u'Clothing Sales', basket_amount=None, num_borrowers=1, male_proportion(borrowers)=u'0.0', disbursal_date=u'2007-07-20T16:35:52Z', use=u'To buy more clothes and baby things for sale', tags=[], status=u'paid', delinquent=None),
 Row(activity=u'Livestock', basket_amount=None, num_borrowers=1, male_proportion(bor

In [36]:
sql_ctx.sql("""
SELECT
    status,
    COUNT(*)
FROM loans
GROUP BY status
LIMIT 10
""").collect()

[Row(status=u'refunded', count(1)=5504),
 Row(status=u'defaulted', count(1)=21776),
 Row(status=u'in_repayment', count(1)=155748),
 Row(status=u'reviewed', count(1)=3),
 Row(status=u'deleted', count(1)=2722),
 Row(status=u'paid', count(1)=775330),
 Row(status=u'issue', count(1)=199),
 Row(status=u'inactive_expired', count(1)=12421),
 Row(status=u'fundraising', count(1)=3986),
 Row(status=u'expired', count(1)=33773)]

# Starts Here - GDP

### Datasets sources
1. Country codes - https://github.com/datasets/country-codes/tree/master/data
2. GDP Data - http://data.worldbank.org/indicator/NY.GDP.MKTP.CD

In [14]:
import pandas as pd
from datetime import datetime
import numpy as np
import math


# Load country info data
country_codes_raw = pd.read_csv(LOCAL_PATH + 'economic-data/country-codes.csv')
country_gdp_raw = pd.read_csv(LOCAL_PATH + 'economic-data/country-gdp.csv')

In [15]:
country_codes_raw.head()

Unnamed: 0,name,official_name_en,official_name_fr,ISO3166-1-Alpha-2,ISO3166-1-Alpha-3,ISO3166-1-numeric,ITU,MARC,WMO,DS,...,ISO4217-currency_minor_unit,ISO4217-currency_name,ISO4217-currency_numeric_code,is_independent,Capital,Continent,TLD,Languages,geonameid,EDGAR
0,,Channel Islands,Îles Anglo-Normandes,,,830,,,,,...,,,,,,,,,,
1,,Sark,Sercq,,,680,,,,,...,,,,,,,,,,
2,Afghanistan,Afghanistan,Afghanistan,AF,AFG,4,AFG,af,AF,AFG,...,2.0,Afghani,971.0,Yes,Kabul,AS,.af,"fa-AF,ps,uz-AF,tk",1149361.0,B2
3,Albania,Albania,Albanie,AL,ALB,8,ALB,aa,AB,AL,...,2.0,Lek,8.0,Yes,Tirana,EU,.al,"sq,el",783754.0,B3
4,Algeria,Algeria,Algérie,DZ,DZA,12,ALG,ae,AL,DZ,...,2.0,Algerian Dinar,12.0,Yes,Algiers,AF,.dz,ar-DZ,2589581.0,B4


In [16]:
country_gdp_raw.head()

Unnamed: 0,﻿Series Name,Series Code,Country Name,Country Code,2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016]
0,GDP per capita (current US$),NY.GDP.PCAP.CD,Afghanistan,AFG,192.153528278789,203.651040923182,224.914712193371,257.175794656273,280.245644106914,380.400955186598,384.131681276838,458.955781585831,569.940728793286,622.379654358451,690.842629014956,653.347488111011,633.947864294639,590.269515382605,..
1,GDP per capita (current US$),NY.GDP.PCAP.CD,Albania,ALB,1453.64277660853,1890.68155743544,2416.5882350701,2709.14293056201,3005.0129033756,3603.01368536638,4370.53964653148,4114.13654490945,4094.35883191918,4437.8119990258,4247.83985201907,4412.34557813421,4588.64944014811,3965.01680558488,..
2,GDP per capita (current US$),NY.GDP.PCAP.CD,Algeria,DZA,1774.29202079911,2094.89330213298,2600.00651972559,3102.03738422673,3467.54474008596,3939.55993939808,4912.25194081995,3875.82209542426,4473.48644568115,5447.40397556569,5583.61615950131,5491.6144135648,5484.0668056148,4206.03123244958,..
3,GDP per capita (current US$),NY.GDP.PCAP.CD,American Samoa,ASM,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..
4,GDP per capita (current US$),NY.GDP.PCAP.CD,Andorra,ADO,24175.3727542522,31742.9925847537,37235.4500323146,39990.3304085679,42417.2291456984,47253.5297963111,46735.9995745016,42701.447136255,39639.386021211,41630.0525792977,39666.3692147448,42806.5224483021,..,..,..


In [17]:
# Clean country codes data
country_codes = country_codes_raw[['official_name_en', 'ISO3166-1-Alpha-2', 'ISO3166-1-Alpha-3', 'ISO4217-currency_alphabetic_code']]
country_codes.head(10)

Unnamed: 0,official_name_en,ISO3166-1-Alpha-2,ISO3166-1-Alpha-3,ISO4217-currency_alphabetic_code
0,Channel Islands,,,
1,Sark,,,
2,Afghanistan,AF,AFG,AFN
3,Albania,AL,ALB,ALL
4,Algeria,DZ,DZA,DZD
5,American Samoa,AS,ASM,USD
6,Andorra,AD,AND,EUR
7,Angola,AO,AGO,AOA
8,Anguilla,AI,AIA,XCD
9,,AQ,ATA,


In [18]:
# Clean gdp data
country_gdp = country_gdp_raw.drop(country_gdp_raw.columns[[0, 1]], axis=1)
country_gdp.columns = ['name', 'country_code_3', '2002', '2003', '2004', '2005', '2006',
                       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']

In [19]:
country_gdp.head()

Unnamed: 0,name,country_code_3,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Afghanistan,AFG,192.153528278789,203.651040923182,224.914712193371,257.175794656273,280.245644106914,380.400955186598,384.131681276838,458.955781585831,569.940728793286,622.379654358451,690.842629014956,653.347488111011,633.947864294639,590.269515382605,..
1,Albania,ALB,1453.64277660853,1890.68155743544,2416.5882350701,2709.14293056201,3005.0129033756,3603.01368536638,4370.53964653148,4114.13654490945,4094.35883191918,4437.8119990258,4247.83985201907,4412.34557813421,4588.64944014811,3965.01680558488,..
2,Algeria,DZA,1774.29202079911,2094.89330213298,2600.00651972559,3102.03738422673,3467.54474008596,3939.55993939808,4912.25194081995,3875.82209542426,4473.48644568115,5447.40397556569,5583.61615950131,5491.6144135648,5484.0668056148,4206.03123244958,..
3,American Samoa,ASM,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..
4,Andorra,ADO,24175.3727542522,31742.9925847537,37235.4500323146,39990.3304085679,42417.2291456984,47253.5297963111,46735.9995745016,42701.447136255,39639.386021211,41630.0525792977,39666.3692147448,42806.5224483021,..,..,..


In [20]:
# Merde gdp and code
country_gdp = pd.merge(country_gdp, country_codes, left_on='country_code_3', right_on='ISO3166-1-Alpha-3', how='left')
country_gdp.drop(['official_name_en', 'ISO3166-1-Alpha-3', 'country_code_3'], axis=1, inplace=True)
country_gdp = country_gdp.rename(columns = {'ISO3166-1-Alpha-2':'country_code',
                                            'ISO4217-currency_alphabetic_code':'currency_code'})
country_gdp.replace('..', np.nan, inplace=True)

# Reorder columns
cols = list(country_gdp.columns)
cols.insert(1, cols.pop(cols.index('country_code')))
cols.insert(2, cols.pop(cols.index('currency_code')))
country_gdp = country_gdp.reindex(columns= cols)

country_gdp

Unnamed: 0,name,country_code,currency_code,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Afghanistan,AF,AFN,192.153528278789,203.651040923182,224.914712193371,257.175794656273,280.245644106914,380.400955186598,384.131681276838,458.955781585831,569.940728793286,622.379654358451,690.842629014956,653.347488111011,633.947864294639,590.269515382605,
1,Albania,AL,ALL,1453.64277660853,1890.68155743544,2416.5882350701,2709.14293056201,3005.0129033756,3603.01368536638,4370.53964653148,4114.13654490945,4094.35883191918,4437.8119990258,4247.83985201907,4412.34557813421,4588.64944014811,3965.01680558488,
2,Algeria,DZ,DZD,1774.29202079911,2094.89330213298,2600.00651972559,3102.03738422673,3467.54474008596,3939.55993939808,4912.25194081995,3875.82209542426,4473.48644568115,5447.40397556569,5583.61615950131,5491.6144135648,5484.0668056148,4206.03123244958,
3,American Samoa,AS,USD,,,,,,,,,,,,,,,
4,Andorra,,,24175.3727542522,31742.9925847537,37235.4500323146,39990.3304085679,42417.2291456984,47253.5297963111,46735.9995745016,42701.447136255,39639.386021211,41630.0525792977,39666.3692147448,42806.5224483021,,,
5,Angola,AO,AOA,775.765518459716,850.075694129697,1135.60456355085,1576.16280066689,2253.83885003865,3151.02243105289,4242.36306234092,3678.94765447179,3886.47935432524,4744.98762949589,5086.8484258086,5327.14889219232,5232.69050054132,4102.11858969272,
6,Antigua and Barbuda,AG,XCD,10027.8562205839,10382.6309004545,10993.3692472668,12079.8656950416,13599.908857304,15276.0682646698,15786.1728395062,13979.2626925883,13017.3103875487,12817.8415733094,13525.616220134,13342.0849977054,13432.0792079208,14128.8785463675,
7,Argentina,AR,ARS,2579.18819898884,3330.42866006847,4251.5653466058,5096.25833247505,5904.67813228342,7226.27029452745,8992.58961919674,8198.56611136286,10332.0323662966,12800.2018563295,13040.306395228,13027.2048221392,12324.9387857728,13431.8783398577,
8,Armenia,AM,AMD,779.829626131555,924.464009817761,1181.9684454224,1625.40776912682,2126.61866107184,3080.97095943572,3919.97547381061,2915.5839059087,3124.78401786195,3417.17183599915,3565.5175749254,3716.82892254464,3873.53356580682,3499.80421759862,
9,Aruba,AW,AWG,20433.6541088167,20834.9397101237,22566.6821576324,23302.8319880055,24015.4206122701,25921.5382341406,27549.8894224976,24640.4212441218,24289.1415161326,25353.7875446441,,,,,


In [21]:
def gdp(country_code, disbursal_date):
    def historical_gdp(array):
        array = np.array(map(float, array))
        array = array[~np.isnan(array)] # Remove NaN
        if len(array) == 0: # No GDP values
            return 0
        return float(np.mean(array, dtype=np.float64))
        
    # TODO: Unable to resolve country code WorldBank dataset has wrong alpha 3 codes e.g. Andorra causing issues
    try:
        float(country_code)
        return 0
    except:
        if country_code not in list(country_gdp['country_code']):
            return 0 # TODO: Bad solution ?   
    
    # Get the historical average GDP if no disbursal date
    all_gdp = country_gdp[country_gdp.country_code == country_code].values[0][3:]
    if (disbursal_date is None): # or (country_gdp[date][country_gdp.country_code == country_code] == float('Nan')):
        return historical_gdp(all_gdp)
    
    date = str(datetime.strptime(disbursal_date, '%Y-%m-%dT%H:%M:%SZ').year)
    # Get the historical average GDP if no GDP for that year
    if pd.isnull(country_gdp[date][country_gdp.country_code == country_code].values[0]):
        return historical_gdp(all_gdp)
    
    return float(country_gdp[date][country_gdp.country_code == country_code].values[0])

sql_ctx.registerFunction('gdp', gdp, pyspark.sql.types.FloatType())
# gdp('ZA', '2016-12-13T18:22:55Z')

# sql_ctx.sql("""
# SELECT 
#     activity, 
#     basket_amount,
#     terms.disbursal_date,
#     gdp(location.country_code, terms.disbursal_date) as gdp
# FROM loans
# LIMIT 10
# """).collect()

In [22]:
train, validation, test = loans.randomSplit([.6, .2, .2], 101)


train.registerTempTable('loans_train')
validation.registerTempTable('loans_validation')
test.registerTempTable('loans_test')



# Starts Here - Currency exchange

In [23]:
currencies_raw = pd.read_csv(LOCAL_PATH + 'economic-data/currencies.csv')
currencies_raw.tail()

Unnamed: 0,﻿Series Name,Series Code,Country Name,Country Code,2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016]
213,"Official exchange rate (LCU per US$, period av...",PA.NUS.FCRF,West Bank and Gaza,WBG,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..
214,"Official exchange rate (LCU per US$, period av...",PA.NUS.FCRF,"Yemen, Rep.",YEM,175.625,183.448333333333,184.775833333333,191.509166666667,197.049166666667,198.953333333333,199.764166666667,202.846666666667,219.59,213.8,214.350833333333,214.89,214.89,214.89,..
215,"Official exchange rate (LCU per US$, period av...",PA.NUS.FCRF,Zambia,ZMB,4.398595,4.73327104649872,4.77887538643579,4.46350331051587,3.60307204258249,4.00252266503643,3.74566069008764,5.04610924521235,4.797136875,4.86066553209349,5.14725266514413,5.39588706794446,6.15281624812449,8.63235596234196,..
216,"Official exchange rate (LCU per US$, period av...",PA.NUS.FCRF,Zimbabwe,ZWE,0.0550982905810338,0.698216071305723,5.07441941463195,22.3890396048255,164.547356500646,9686.77166954175,6723052073.3381,..,..,..,..,..,..,..,..
217,"Official exchange rate (LCU per US$, period av...",PA.NUS.FCRF,Euro area,EMU,1.06255166666667,0.886034166666667,0.805365,0.80412,0.797140833333333,0.7306375,0.682674711239873,0.719843359785615,0.755044951989835,0.719355253609154,0.778293601412852,0.75315918184727,0.753730736717402,0.901658961641278,..


In [24]:
# Cleanup
currencies = currencies_raw.drop(country_gdp_raw.columns[[0, 1]], axis=1)
currencies.columns = ['country_name', 'country_code_3', '2002', '2003', '2004', '2005', '2006',
                       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']

In [25]:
# Get ISO 2 code
currencies = pd.merge(currencies, country_codes, left_on='country_code_3', right_on='ISO3166-1-Alpha-3', how='left')
currencies.drop(['official_name_en', 'ISO3166-1-Alpha-3', 'country_code_3'], axis=1, inplace=True)
currencies = currencies.rename(columns = {'ISO3166-1-Alpha-2':'country_code',
                                            'ISO4217-currency_alphabetic_code':'currency_code'})
currencies.replace('..', np.nan, inplace=True)

# Add code for European Union
currencies.set_value(217, 'country_code', 'EU')
currencies.set_value(217, 'currency_code', 'EMU')

# Reorder columns
cols = list(currencies.columns)
cols.insert(1, cols.pop(cols.index('country_code')))
cols.insert(2, cols.pop(cols.index('currency_code')))
currencies = currencies.reindex(columns=cols)

currencies.tail()

Unnamed: 0,country_name,country_code,currency_code,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
213,West Bank and Gaza,,,,,,,,,,,,,,,,,
214,"Yemen, Rep.",YE,YER,175.625,183.448333333333,184.775833333333,191.509166666667,197.049166666667,198.953333333333,199.764166666667,202.846666666667,219.59,213.8,214.350833333333,214.89,214.89,214.89,
215,Zambia,ZM,ZMW,4.398595,4.73327104649872,4.77887538643579,4.46350331051587,3.60307204258249,4.00252266503643,3.74566069008764,5.04610924521235,4.797136875,4.86066553209349,5.14725266514413,5.39588706794446,6.15281624812449,8.63235596234196,
216,Zimbabwe,ZW,ZWL,0.0550982905810338,0.698216071305723,5.07441941463195,22.3890396048255,164.547356500646,9686.77166954175,6723052073.3381,,,,,,,,
217,Euro area,EU,EMU,1.06255166666667,0.886034166666667,0.805365,0.80412,0.797140833333333,0.7306375,0.682674711239873,0.719843359785615,0.755044951989835,0.719355253609154,0.778293601412852,0.75315918184727,0.753730736717402,0.901658961641278,


In [26]:
def xchange_rate(country_code, disbursal_date):
    def historical_rates(array):
        array = np.array(map(float, array))
        array = array[~np.isnan(array)] # Remove NaN
        if len(array) == 0: # No rate values
            return 1
        return float(np.mean(array, dtype=np.float64))
    
    eu = ['AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR','HU','IE',
          'IT','LV','LT','LU','MT','NL','PL','PT','RO','SK','SI','ES','SE','GB']
    us = ['AS','GU','MP','PR','UM','VI']
    try:
        float(country_code) # Country code unknown?
        if pd.isnull(country_code):
            return 1 # TODO: Bad solution ??
    except:
        if country_code in eu:
            country_code = 'EU'
        elif country_code in us:
            country_code = 'US'
        if country_code not in list(currencies['country_code']):
            return 1
        
        
    # TODO: Unable to resolve country code WorldBank dataset has wrong alpha 3 codes e.g. Andorra causing
    try:
        float(country_code)
        return 0
    except:
        if country_code not in list(currencies['country_code']):
            return 0 # TODO: Bad solution    
    
    # Get the historical average exchange rate if no disbursal date
    all_rates = currencies[currencies.country_code == country_code].values[0][3:]
    if (disbursal_date is None): # or (country_gdp[date][country_gdp.country_code == country_code] == float('Nan')):
        return historical_rates(all_rates)
    
    date = str(datetime.strptime(disbursal_date, '%Y-%m-%dT%H:%M:%SZ').year)
    # Get the historical average exchange rate if no GDP for that year
    if pd.isnull(currencies[date][currencies.country_code == country_code].values[0]):
        return historical_rates(all_rates)
    
    return float(currencies[date][currencies.country_code == country_code].values[0])

sql_ctx.registerFunction('xchange_rate', xchange_rate, pyspark.sql.types.FloatType())
# xchange_rate('BE', '2016-12-13T18:22:55Z') 

# sql_ctx.sql("""
# SELECT 
#     xchange_rate(location.country_code, terms.disbursal_date) as xchange_rate
# FROM loans
# LIMIT 20
# """).collect()

In [81]:
def extract_tags(tags):
    value = []
    if len(tags) != 0:
        value = [item for (item,) in tags]
    return ','.join(value)

sql_ctx.registerFunction('extract_tags', extract_tags, pyspark.sql.types.StringType())

        
# # unpack_tags([('volunteer_pick',), ('volunteer_like',)])

# sql_ctx.registerFunction('unpack_tags', unpack_tags)
# sql_ctx.sql(
# """
# SELECT 
# unpack_tags(tags) as tags
# FROM loans
# LIMIT 20000
# """
# ).collect()

In [82]:

query = '''
SELECT
    id,
    activity,
    size(borrowers) as num_borrowers,
    male_proportion(borrowers) as male_proportion,
    lender_count,
    location.country,
    location.country_code,
    partner_id,
    sector,
    extract_tags(tags) as tags,
    DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,
    terms.disbursal_amount,
    terms.disbursal_currency,
    terms.disbursal_date,
    size(terms.scheduled_payments) as num_repayments,
    terms.repayment_interval,
    CASE WHEN
        (status = 'defaulted') OR
        (status = 'deleted') OR
        (status = 'issue') OR
        (status = 'inactive_expired') OR
        (status = 'expired') OR
        (status = 'inactive') OR
        (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,
    gdp(location.country_code, terms.disbursal_date) as gdp,
    xchange_rate(location.country_code, terms.disbursal_date) as xchange_rate,
    status,
    delinquent
    
FROM {}
WHERE
    status != 'fundraising' AND
    status != 'funded'
'''# Removed CASE WHEN (status = 'refunded') as it appears not to be bad loan indicator

train_filtered = sql_ctx.sql(query.format('loans_train'))
# sparkSql.sql(query.format('loans_validation')).write.json('validation_data-filtered.json')

### Prediction

In [163]:
loans_pd = sql_ctx.sql(query2.format('loans')).toPandas()

In [164]:
loans_pd.head()

Unnamed: 0,id,activity,num_borrowers,male_proportion,lender_count,country,country_code,partner_id,sector,tags,...,disbursal_amount,disbursal_currency,disbursal_date,num_repayments,repayment_interval,bad_loan,gdp,xchange_rate,status,delinquent
0,1224,Clothing Sales,1,0.0,0,Honduras,HN,5,Clothing,,...,250.0,,,0,,1,1884.941162,18.520081,deleted,
1,5320,Food Production/Sales,1,0.0,7,Ghana,GH,19,Food,,...,500.0,USD,2007-03-15T04:52:22Z,9,Monthly,0,1099.022339,0.935248,paid,
2,9416,Food Production/Sales,1,0.0,3,Vietnam,VN,41,Food,,...,75.0,USD,2007-05-17T22:02:42Z,6,Monthly,0,919.20929,16105.125,paid,
3,13512,Clothing Sales,1,0.0,10,Nigeria,NG,20,Clothing,,...,450.0,USD,2007-07-20T16:35:52Z,8,Monthly,0,1131.147705,125.808105,paid,
4,17608,Livestock,1,0.0,36,Cote D'Ivoire,CI,53,Agriculture,,...,950.0,USD,2007-09-20T02:55:18Z,12,Monthly,0,1078.541504,479.266785,paid,


In [165]:
from sklearn.model_selection import train_test_split

loans_pd.fillna(0, inplace=True)

In [166]:
loans_pd = loans_pd.drop(['status', 'delinquent'], axis=1)
loans_dummies = pd.get_dummies(loans_pd)

train, test = train_test_split(loans_dummies, test_size=0.30, random_state=42)

In [168]:
train_target = pd.Series(train['bad_loan'])
train_ids = pd.Series(test['id'])

test_target = pd.Series(test['bad_loan'])
test_ids = pd.Series(test['id'])

train = train.drop(['id', 'bad_loan'], axis=1)
test = test.drop(['id', 'bad_loan'], axis=1)

train_target.head()

11    0
47    0
85    0
28    1
93    1
Name: bad_loan, dtype: int64

In [169]:
train_data = train.as_matrix()
test_data = test.as_matrix()

In [172]:
from sklearn import svm

clf = svm.SVC(gamma=0.001, C=100.)

In [173]:
clf.fit(train_data, train_target)

predicted = clf.predict(test_data)
np.mean(predicted == test_target)

0.93333333333333335