kiva-dig/Default Prediction.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sparkSql = (SparkSession.builder\n",
    "         .master(\"local\")\n",
    "         .appName(\"Kiva Exploration\")\n",
    "         .getOrCreate())\n",
    "\n",
    "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
    "loans.registerTempTable('loans')\n",
    "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
    "lenders.registerTempTable('lenders')\n",
    "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
    "loans_lenders.registerTempTable('loans_lenders')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Custom Functions\n",
    "\n",
    "## Gender Ratio\n",
    "\n",
    "0 = All female\n",
    "\n",
    "1 = All male"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pyspark\n",
    "\n",
    "def gender_ratio(array):\n",
    "    num_males = 0\n",
    "    for item in array:\n",
    "        if item.gender == 'M':\n",
    "            num_males += 1\n",
    "            \n",
    "    return float(num_males) / len(array)\n",
    "\n",
    "sparkSql.udf.register('gender_ratio',\n",
    "                      gender_ratio,\n",
    "                      pyspark.sql.types.FloatType())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch GDP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "# Load country info data\n",
    "country_codes_raw = pd.read_csv('economic-data/country-codes.csv')\n",
    "country_gdp_raw = pd.read_csv('economic-data/country-gdp.csv')\n",
    "\n",
    "# Clean country codes data\n",
    "country_codes = country_codes_raw[['official_name_en', 'ISO3166-1-Alpha-2', \n",
    "                                   'ISO3166-1-Alpha-3', 'ISO4217-currency_alphabetic_code']]\n",
    "\n",
    "# Clean gdp data\n",
    "country_gdp = country_gdp_raw.drop(country_gdp_raw.columns[[0, 1]], axis=1)\n",
    "country_gdp.columns = ['name', 'country_code_3', '2002', '2003', '2004', '2005', '2006',\n",
    "                       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']\n",
    "\n",
    "# Merge gdp and code\n",
    "country_gdp = pd.merge(country_gdp, country_codes, left_on='country_code_3', right_on='ISO3166-1-Alpha-3', how='left')\n",
    "country_gdp.drop(['official_name_en', 'ISO3166-1-Alpha-3', 'country_code_3'], axis=1, inplace=True)\n",
    "country_gdp = country_gdp.rename(columns = {'ISO3166-1-Alpha-2':'country_code',\n",
    "                                            'ISO4217-currency_alphabetic_code':'currency_code'})\n",
    "country_gdp.replace('..', np.nan, inplace=True)\n",
    "\n",
    "# Reorder columns\n",
    "cols = list(country_gdp.columns)\n",
    "cols.insert(1, cols.pop(cols.index('country_code')))\n",
    "cols.insert(2, cols.pop(cols.index('currency_code')))\n",
    "country_gdp = country_gdp.reindex(columns= cols)\n",
    "\n",
    "def gdp(country_code, disbursal_date):\n",
    "    def historical_gdp(array):\n",
    "        array = np.array(map(float, array))\n",
    "        array = array[~np.isnan(array)] # Remove NaN\n",
    "        if len(array) == 0: # No GDP values\n",
    "            return 0\n",
    "        return float(np.mean(array, dtype=np.float64))\n",
    "        \n",
    "    # TODO: Unable to resolve country code WorldBank dataset has wrong alpha 3 codes e.g. Andorra causing issues\n",
    "    try:\n",
    "        float(country_code)\n",
    "        return 0\n",
    "    except:\n",
    "        if country_code not in list(country_gdp['country_code']):\n",
    "            return 0 # TODO: Bad solution ?   \n",
    "    \n",
    "    # Get the historical average GDP if no disbursal date\n",
    "    all_gdp = country_gdp[country_gdp.country_code == country_code].values[0][3:]\n",
    "    if (disbursal_date is None): # or (country_gdp[date][country_gdp.country_code == country_code] == float('Nan')):\n",
    "        return historical_gdp(all_gdp)\n",
    "    \n",
    "    date = str(datetime.strptime(disbursal_date, '%Y-%m-%dT%H:%M:%SZ').year)\n",
    "    # Get the historical average GDP if no GDP for that year\n",
    "    if pd.isnull(country_gdp[date][country_gdp.country_code == country_code].values[0]):\n",
    "        return historical_gdp(all_gdp)\n",
    "    \n",
    "    return float(country_gdp[date][country_gdp.country_code == country_code].values[0])\n",
    "\n",
    "sparkSql.udf.register('gdp', gdp, pyspark.sql.types.FloatType())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch Exchange Rates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "currencies_raw = pd.read_csv('economic-data/currencies.csv')\n",
    "# Cleanup\n",
    "currencies = currencies_raw.drop(country_gdp_raw.columns[[0, 1]], axis=1)\n",
    "currencies.columns = ['country_name', 'country_code_3', '2002', '2003', '2004', '2005', '2006',\n",
    "                       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']\n",
    "\n",
    "# Get ISO 2 code\n",
    "currencies = pd.merge(currencies, country_codes, left_on='country_code_3', right_on='ISO3166-1-Alpha-3', how='left')\n",
    "currencies.drop(['official_name_en', 'ISO3166-1-Alpha-3', 'country_code_3'], axis=1, inplace=True)\n",
    "currencies = currencies.rename(columns = {'ISO3166-1-Alpha-2':'country_code',\n",
    "                                            'ISO4217-currency_alphabetic_code':'currency_code'})\n",
    "currencies.replace('..', np.nan, inplace=True)\n",
    "\n",
    "# Add code for European Union\n",
    "currencies.set_value(217, 'country_code', 'EU')\n",
    "currencies.set_value(217, 'currency_code', 'EMU')\n",
    "\n",
    "# Reorder columns\n",
    "cols = list(currencies.columns)\n",
    "cols.insert(1, cols.pop(cols.index('country_code')))\n",
    "cols.insert(2, cols.pop(cols.index('currency_code')))\n",
    "currencies = currencies.reindex(columns=cols)\n",
    "\n",
    "def xchange_rate(country_code, disbursal_date):\n",
    "    def historical_rates(array):\n",
    "        array = np.array(map(float, array))\n",
    "        array = array[~np.isnan(array)] # Remove NaN\n",
    "        if len(array) == 0: # No rate values\n",
    "            return 1\n",
    "        return float(np.mean(array, dtype=np.float64))\n",
    "    \n",
    "    eu = ['AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR','HU','IE',\n",
    "          'IT','LV','LT','LU','MT','NL','PL','PT','RO','SK','SI','ES','SE','GB']\n",
    "    us = ['AS','GU','MP','PR','UM','VI']\n",
    "    try:\n",
    "        float(country_code) # Country code unknown?\n",
    "        if pd.isnull(country_code):\n",
    "            return 1 # TODO: Bad solution ??\n",
    "    except:\n",
    "        if country_code in eu:\n",
    "            country_code = 'EU'\n",
    "        elif country_code in us:\n",
    "            country_code = 'US'\n",
    "        if country_code not in list(currencies['country_code']):\n",
    "            return 1\n",
    "        \n",
    "        \n",
    "    # TODO: Unable to resolve country code WorldBank dataset has wrong alpha 3 codes e.g. Andorra causing\n",
    "    try:\n",
    "        float(country_code)\n",
    "        return 0\n",
    "    except:\n",
    "        if country_code not in list(currencies['country_code']):\n",
    "            return 0 # TODO: Bad solution    \n",
    "    \n",
    "    # Get the historical average exchange rate if no disbursal date\n",
    "    all_rates = currencies[currencies.country_code == country_code].values[0][3:]\n",
    "    if (disbursal_date is None): # or (country_gdp[date][country_gdp.country_code == country_code] == float('Nan')):\n",
    "        return historical_rates(all_rates)\n",
    "    \n",
    "    date = str(datetime.strptime(disbursal_date, '%Y-%m-%dT%H:%M:%SZ').year)\n",
    "    # Get the historical average exchange rate if no GDP for that year\n",
    "    if pd.isnull(currencies[date][currencies.country_code == country_code].values[0]):\n",
    "        return historical_rates(all_rates)\n",
    "    \n",
    "    return float(currencies[date][currencies.country_code == country_code].values[0])\n",
    "\n",
    "sparkSql.udf.register('xchange_rate', xchange_rate, pyspark.sql.types.FloatType())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fetch actual data\n",
    "\n",
    "Get all data that we are going to use, get dummies, then split into train/validation/test."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Query our datasets to train on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "query = '''\n",
    "SELECT\n",
    "    id,\n",
    "    activity,\n",
    "    size(borrowers) as num_borrowers,\n",
    "    gender_ratio(borrowers) as gender_ratio,\n",
    "    lender_count,\n",
    "    location.country,\n",
    "    location.country_code,\n",
    "    partner_id,\n",
    "    sector,\n",
    "    tags,\n",
    "    DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n",
    "    terms.disbursal_amount,\n",
    "    terms.disbursal_currency,\n",
    "    terms.disbursal_date,\n",
    "    size(terms.scheduled_payments) as num_repayments,\n",
    "    terms.repayment_interval,\n",
    "    CASE WHEN\n",
    "        (status = 'refunded') OR\n",
    "        (status = 'defaulted') OR\n",
    "        (status = 'deleted') OR\n",
    "        (status = 'issue') OR\n",
    "        (status = 'inactive_expired') OR\n",
    "        (status = 'expired') OR\n",
    "        (status = 'inactive') OR\n",
    "        (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n",
    "    gdp(location.country_code, terms.disbursal_date) as gdp,\n",
    "    xchange_rate(location.country_code, terms.disbursal_date) as xchange_rate,\n",
    "    status,\n",
    "    delinquent\n",
    "    \n",
    "FROM loans\n",
    "WHERE\n",
    "    status != 'fundraising' AND\n",
    "    status != 'funded'\n",
    "'''\n",
    "\n",
    "dataset = sparkSql.sql(query).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_columns = [\n",
    "    'activity', 'num_borrowers', 'gender_ratio',\n",
    "    'lender_count', 'country', 'partner_id', 'sector',\n",
    "    'loan_length', 'disbursal_amount', 'disbursal_currency',\n",
    "    'num_repayments', 'repayment_interval', 'gdp', 'xchange_rate'\n",
    "]\n",
    "\n",
    "y_column = ['bad_loan']\n",
    "\n",
    "dummy_set = pd.get_dummies(dataset[X_columns + y_column])\n",
    "dummy_set.to_csv('processed_dummy.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can restart the kernel to clear memory, and start processing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "processed_dummy = pd.read_csv('processed_dummy.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "train, validate, test = np.split(processed_dummy.sample(frac=1, random_state=0),\n",
    "                                 [int(.6*len(processed_dummy)),\n",
    "                                  int(.8*len(processed_dummy))])\n",
    "\n",
    "train.to_csv('processed_train.csv')\n",
    "validate.to_csv('processed_validate.csv')\n",
    "test.to_csv('processed_test.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Testing all the models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Naive guess:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.89836166750827584"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_x = train.drop('bad_loan', axis=1)\n",
    "train_y = train['bad_loan']\n",
    "\n",
    "1 - train_y.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished training 1\n",
      "Finished training 0.1\n",
      "Finished training 0.01\n",
      "Finished training 10\n"
     ]
    }
   ],
   "source": [
    "from itertools import product\n",
    "import pickle\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "for C in [1, .1, .01, 10]:\n",
    "    lr = LogisticRegression(C=C)\n",
    "\n",
    "    lr.fit(train_x, train_y)\n",
    "    with open('lr_{}.pickle'.format(C), 'w') as handle:\n",
    "        pickle.dump(lr, handle)\n",
    "        \n",
    "    del(lr)\n",
    "    print(\"Finished training {}\".format(C))\n",
    "    #print(\"C: {}; gamma: {}; score: {}\".format(\n",
    "    #        C, gamma, svc.score(valid_x, valid_y)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/lib/python2.7/dist-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.\n",
      "  warnings.warn(\"Variables are collinear.\")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished training 342\n",
      "Finished training 250\n",
      "Finished training 150\n",
      "Finished training 75\n"
     ]
    }
   ],
   "source": [
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "import pickle\n",
    "\n",
    "# Number of columns is 342\n",
    "for n_components in [342, 250, 150, 75]:\n",
    "    lda = LinearDiscriminantAnalysis(n_components=n_components)\n",
    "    lda.fit(train_x, train_y)\n",
    "    with open('lda_{}.pickle'.format(n_components), 'w') as handle:\n",
    "        pickle.dump(lda, handle)\n",
    "    \n",
    "    del(lda)\n",
    "    print(\"Finished training {}\".format(n_components))\n",
    "    #print(\"N_components: {}; score: {}\".format(\n",
    "    #        n_components, lda.score(valid_x, valid_y)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished training 10\n",
      "Finished training 50\n",
      "Finished training 75\n",
      "Finished training 100\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "for n_estimators in [10, 50, 75, 100]:\n",
    "    rf = RandomForestClassifier(n_estimators=n_estimators)\n",
    "    rf.fit(train_x, train_y)\n",
    "    with open('rf_{}.pickle'.format(n_estimators), 'w') as handle:\n",
    "        pickle.dump(rf, handle)\n",
    "        \n",
    "    del(rf)        \n",
    "    print(\"Finished training {}\".format(n_estimators))\n",
    "    #print(\"N_estimators: {}; score: {}\".format(\n",
    "    #        n_estimators, score(valid_x, valid_y)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)\n",
    "\n",
    "valid_x = valid.drop('bad_loan', axis=1)\n",
    "valid_y = valid['bad_loan']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logistic Regression, C=1; Score: 0.89904155569\n",
      "Logistic Regression, C=0.1; Score: 0.89904155569\n",
      "Logistic Regression, C=0.01; Score: 0.89904155569\n",
      "Logistic Regression, C=10; Score: 0.89904155569\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "lr_params = [1, .1, .01, 10]\n",
    "\n",
    "for C in lr_params:\n",
    "    with open('lr_{}.pickle'.format(C)) as handle:\n",
    "        model = pickle.load(handle)\n",
    "        \n",
    "        score = model.score(valid_x, valid_y)\n",
    "        \n",
    "        print('Logistic Regression, C={}; Score: {}'.format(\n",
    "                C, score\n",
    "            ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Linear Discriminant Analysis, components=342; Score: 0.897066249493\n",
      "Linear Discriminant Analysis, components=250; Score: 0.897066249493\n",
      "Linear Discriminant Analysis, components=150; Score: 0.897066249493\n",
      "Linear Discriminant Analysis, components=75; Score: 0.897066249493\n"
     ]
    }
   ],
   "source": [
    "lda_components = [342, 250, 150, 75]\n",
    "\n",
    "for n_components in lda_components:\n",
    "    with open('lda_{}.pickle'.format(n_components)) as handle:\n",
    "        model = pickle.load(handle)\n",
    "        \n",
    "        score = model.score(valid_x, valid_y)\n",
    "        \n",
    "        print('Linear Discriminant Analysis, components={}; Score: {}'.format(\n",
    "                n_components, score\n",
    "            ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random Forests, estimators: 10; Score: 0.934468350545\n",
      "Random Forests, estimators: 50; Score: 0.936468409953\n",
      "Random Forests, estimators: 75; Score: 0.936324841332\n",
      "Random Forests, estimators: 100; Score: 0.936740695268\n"
     ]
    }
   ],
   "source": [
    "rf_estimators = [10, 50, 75, 100]\n",
    "\n",
    "for estimators in rf_estimators:\n",
    "    with open('rf_{}.pickle'.format(estimators)) as handle:\n",
    "        model = pickle.load(handle)\n",
    "        \n",
    "        score = model.score(valid_x, valid_y)\n",
    "        \n",
    "        print('Random Forests, estimators: {}; Score: {}'.format(\n",
    "                estimators, score\n",
    "            ))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}