kiva-dig/Default Prediction.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sparkSql = (SparkSession.builder\n",
" .master(\"local\")\n",
" .appName(\"Kiva Exploration\")\n",
" .getOrCreate())\n",
"\n",
"loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
"loans.registerTempTable('loans')\n",
"lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
"lenders.registerTempTable('lenders')\n",
"loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
"loans_lenders.registerTempTable('loans_lenders')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Custom Functions\n",
"\n",
"## Gender Ratio\n",
"\n",
"0 = All female\n",
"\n",
"1 = All male"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pyspark\n",
"\n",
"def gender_ratio(array):\n",
" num_males = 0\n",
" for item in array:\n",
" if item.gender == 'M':\n",
" num_males += 1\n",
" \n",
" return float(num_males) / len(array)\n",
"\n",
"sparkSql.udf.register('gender_ratio',\n",
" gender_ratio,\n",
" pyspark.sql.types.FloatType())"
]
},
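{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick local sanity check of `gender_ratio` (not part of the original pipeline). The `Borrower` namedtuple is a hypothetical stand-in for the real borrower rows, which only need a `gender` attribute here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from collections import namedtuple\n",
"\n",
"# Hypothetical stand-in for a borrower row; only .gender is used by the UDF\n",
"Borrower = namedtuple('Borrower', ['gender'])\n",
"\n",
"assert gender_ratio([Borrower('F'), Borrower('F')]) == 0.0  # all female\n",
"assert gender_ratio([Borrower('M'), Borrower('M')]) == 1.0  # all male\n",
"assert gender_ratio([Borrower('M'), Borrower('F')]) == 0.5  # mixed group"
]
},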
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fetch GDP"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from datetime import datetime\n",
"import numpy as np\n",
"\n",
"\n",
"# Load country info data\n",
"country_codes_raw = pd.read_csv('economic-data/country-codes.csv')\n",
"country_gdp_raw = pd.read_csv('economic-data/country-gdp.csv')\n",
"\n",
"# Clean country codes data\n",
"country_codes = country_codes_raw[['official_name_en', 'ISO3166-1-Alpha-2', \n",
" 'ISO3166-1-Alpha-3', 'ISO4217-currency_alphabetic_code']]\n",
"\n",
"# Clean gdp data\n",
"country_gdp = country_gdp_raw.drop(country_gdp_raw.columns[[0, 1]], axis=1)\n",
"country_gdp.columns = ['name', 'country_code_3', '2002', '2003', '2004', '2005', '2006',\n",
" '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']\n",
"\n",
"# Merge gdp and code\n",
"country_gdp = pd.merge(country_gdp, country_codes, left_on='country_code_3', right_on='ISO3166-1-Alpha-3', how='left')\n",
"country_gdp.drop(['official_name_en', 'ISO3166-1-Alpha-3', 'country_code_3'], axis=1, inplace=True)\n",
"country_gdp = country_gdp.rename(columns = {'ISO3166-1-Alpha-2':'country_code',\n",
" 'ISO4217-currency_alphabetic_code':'currency_code'})\n",
"country_gdp.replace('..', np.nan, inplace=True)\n",
"\n",
"# Reorder columns\n",
"cols = list(country_gdp.columns)\n",
"cols.insert(1, cols.pop(cols.index('country_code')))\n",
"cols.insert(2, cols.pop(cols.index('currency_code')))\n",
"country_gdp = country_gdp.reindex(columns= cols)\n",
"\n",
"def gdp(country_code, disbursal_date):\n",
" def historical_gdp(array):\n",
" array = np.array(map(float, array))\n",
" array = array[~np.isnan(array)] # Remove NaN\n",
" if len(array) == 0: # No GDP values\n",
" return 0\n",
" return float(np.mean(array, dtype=np.float64))\n",
" \n",
" # TODO: Unable to resolve country code WorldBank dataset has wrong alpha 3 codes e.g. Andorra causing issues\n",
" try:\n",
" float(country_code)\n",
" return 0\n",
" except:\n",
" if country_code not in list(country_gdp['country_code']):\n",
" return 0 # TODO: Bad solution ? \n",
" \n",
" # Get the historical average GDP if no disbursal date\n",
" all_gdp = country_gdp[country_gdp.country_code == country_code].values[0][3:]\n",
" if (disbursal_date is None): # or (country_gdp[date][country_gdp.country_code == country_code] == float('Nan')):\n",
" return historical_gdp(all_gdp)\n",
" \n",
" date = str(datetime.strptime(disbursal_date, '%Y-%m-%dT%H:%M:%SZ').year)\n",
" # Get the historical average GDP if no GDP for that year\n",
" if pd.isnull(country_gdp[date][country_gdp.country_code == country_code].values[0]):\n",
" return historical_gdp(all_gdp)\n",
" \n",
" return float(country_gdp[date][country_gdp.country_code == country_code].values[0])\n",
"\n",
"sparkSql.udf.register('gdp', gdp, pyspark.sql.types.FloatType())"
]
},
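{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hedged spot check of the `gdp` fallback logic; the codes and date below are illustrative, not drawn from the loans data. A resolvable code with a dated value should return that year's GDP, a missing year should fall back to the historical mean, and an unresolvable code should return 0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# 'KE' is an arbitrary example ISO-2 code; 'XX' should not resolve\n",
"print(gdp('KE', '2013-04-01T00:00:00Z'))  # GDP for 2013, if present\n",
"print(gdp('KE', None))                    # historical mean across years\n",
"print(gdp('XX', None))                    # unresolvable code -> 0"
]
},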
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fetch Exchange Rates"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"currencies_raw = pd.read_csv('economic-data/currencies.csv')\n",
"# Cleanup\n",
"currencies = currencies_raw.drop(country_gdp_raw.columns[[0, 1]], axis=1)\n",
"currencies.columns = ['country_name', 'country_code_3', '2002', '2003', '2004', '2005', '2006',\n",
" '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']\n",
"\n",
"# Get ISO 2 code\n",
"currencies = pd.merge(currencies, country_codes, left_on='country_code_3', right_on='ISO3166-1-Alpha-3', how='left')\n",
"currencies.drop(['official_name_en', 'ISO3166-1-Alpha-3', 'country_code_3'], axis=1, inplace=True)\n",
"currencies = currencies.rename(columns = {'ISO3166-1-Alpha-2':'country_code',\n",
" 'ISO4217-currency_alphabetic_code':'currency_code'})\n",
"currencies.replace('..', np.nan, inplace=True)\n",
"\n",
"# Add code for European Union\n",
"currencies.set_value(217, 'country_code', 'EU')\n",
"currencies.set_value(217, 'currency_code', 'EMU')\n",
"\n",
"# Reorder columns\n",
"cols = list(currencies.columns)\n",
"cols.insert(1, cols.pop(cols.index('country_code')))\n",
"cols.insert(2, cols.pop(cols.index('currency_code')))\n",
"currencies = currencies.reindex(columns=cols)\n",
"\n",
"def xchange_rate(country_code, disbursal_date):\n",
" def historical_rates(array):\n",
" array = np.array(map(float, array))\n",
" array = array[~np.isnan(array)] # Remove NaN\n",
" if len(array) == 0: # No rate values\n",
" return 1\n",
" return float(np.mean(array, dtype=np.float64))\n",
" \n",
" eu = ['AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR','HU','IE',\n",
" 'IT','LV','LT','LU','MT','NL','PL','PT','RO','SK','SI','ES','SE','GB']\n",
" us = ['AS','GU','MP','PR','UM','VI']\n",
" try:\n",
" float(country_code) # Country code unknown?\n",
" if pd.isnull(country_code):\n",
" return 1 # TODO: Bad solution ??\n",
" except:\n",
" if country_code in eu:\n",
" country_code = 'EU'\n",
" elif country_code in us:\n",
" country_code = 'US'\n",
" if country_code not in list(currencies['country_code']):\n",
" return 1\n",
" \n",
" \n",
" # TODO: Unable to resolve country code WorldBank dataset has wrong alpha 3 codes e.g. Andorra causing\n",
" try:\n",
" float(country_code)\n",
" return 0\n",
" except:\n",
" if country_code not in list(currencies['country_code']):\n",
" return 0 # TODO: Bad solution \n",
" \n",
" # Get the historical average exchange rate if no disbursal date\n",
" all_rates = currencies[currencies.country_code == country_code].values[0][3:]\n",
" if (disbursal_date is None): # or (country_gdp[date][country_gdp.country_code == country_code] == float('Nan')):\n",
" return historical_rates(all_rates)\n",
" \n",
" date = str(datetime.strptime(disbursal_date, '%Y-%m-%dT%H:%M:%SZ').year)\n",
" # Get the historical average exchange rate if no GDP for that year\n",
" if pd.isnull(currencies[date][currencies.country_code == country_code].values[0]):\n",
" return historical_rates(all_rates)\n",
" \n",
" return float(currencies[date][currencies.country_code == country_code].values[0])\n",
"\n",
"sparkSql.udf.register('xchange_rate', xchange_rate, pyspark.sql.types.FloatType())"
]
},
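{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same kind of spot check for `xchange_rate`, exercising the eurozone and US-territory remapping. Codes are again illustrative; an unresolvable code should fall back to the neutral rate of 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(xchange_rate('DE', '2012-06-01T00:00:00Z'))  # remapped to 'EU'\n",
"print(xchange_rate('PR', None))                    # US territory -> 'US'\n",
"print(xchange_rate('XX', None))                    # unresolvable -> 1"
]
},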
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fetch actual data\n",
"\n",
"Get all data that we are going to use, get dummies, then split into train/validation/test."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Query our datasets to train on."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"query = '''\n",
"SELECT\n",
" id,\n",
" activity,\n",
" size(borrowers) as num_borrowers,\n",
" gender_ratio(borrowers) as gender_ratio,\n",
" lender_count,\n",
" location.country,\n",
" location.country_code,\n",
" partner_id,\n",
" sector,\n",
" tags,\n",
" DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n",
" terms.disbursal_amount,\n",
" terms.disbursal_currency,\n",
" terms.disbursal_date,\n",
" size(terms.scheduled_payments) as num_repayments,\n",
" terms.repayment_interval,\n",
" CASE WHEN\n",
" (status = 'refunded') OR\n",
" (status = 'defaulted') OR\n",
" (status = 'deleted') OR\n",
" (status = 'issue') OR\n",
" (status = 'inactive_expired') OR\n",
" (status = 'expired') OR\n",
" (status = 'inactive') OR\n",
" (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n",
" gdp(location.country_code, terms.disbursal_date) as gdp,\n",
" xchange_rate(location.country_code, terms.disbursal_date) as xchange_rate,\n",
" status,\n",
" delinquent\n",
" \n",
"FROM loans\n",
"WHERE\n",
" status != 'fundraising' AND\n",
" status != 'funded'\n",
"'''\n",
"\n",
"dataset = sparkSql.sql(query).toPandas()"
]
},
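{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before modeling, it is worth checking the class balance: if bad loans are rare, raw accuracy will look flattering no matter what the model does. A minimal check:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(dataset.shape)\n",
"print(dataset['bad_loan'].mean())  # fraction of bad loans"
]
},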
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Splits"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_columns = [\n",
" 'activity', 'num_borrowers', 'gender_ratio',\n",
" 'lender_count', 'country', 'partner_id', 'sector',\n",
" 'loan_length', 'disbursal_amount', 'disbursal_currency',\n",
" 'num_repayments', 'repayment_interval', 'gdp', 'xchange_rate'\n",
"]\n",
"\n",
"y_column = ['bad_loan']\n",
"\n",
"dummy_set = pd.get_dummies(dataset[X_columns + y_column])\n",
"dummy_set.to_csv('processed_dummy.csv')"
]
},
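{
"cell_type": "markdown",
"metadata": {},
"source": [
"`get_dummies` one-hot encodes every categorical column (activity, country, currency, ...), so the column count grows substantially; a later cell notes 342 feature columns. A quick confirmation before restarting the kernel:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(dummy_set.shape)  # (rows, feature columns + bad_loan label)"
]
},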
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can restart the kernel to clear memory, and start processing."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"processed_dummy = pd.read_csv('processed_dummy.csv', index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"train, validate, test = np.split(processed_dummy.sample(frac=1, random_state=0),\n",
" [int(.6*len(processed_dummy)),\n",
" int(.8*len(processed_dummy))])\n",
"\n",
"train.to_csv('processed_train.csv')\n",
"validate.to_csv('processed_validate.csv')\n",
"test.to_csv('processed_test.csv')"
]
},
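{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sanity check that the shuffled split really is 60/20/20 and loses no rows:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print('{} / {} / {}'.format(len(train), len(validate), len(test)))\n",
"assert len(train) + len(validate) + len(test) == len(processed_dummy)"
]
},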
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Testing all the models"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Naive guess:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.89836166750827584"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_x = train.drop('bad_loan', axis=1)\n",
"train_y = train['bad_loan']\n",
"\n",
"1 - train_y.mean()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python2.7/dist-packages/sklearn/linear_model/sag.py:267: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
" \"the coef_ did not converge\", ConvergenceWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished training 1\n",
"Finished training 0.1\n",
"Finished training 0.01\n",
"Finished training 0.001\n",
"Finished training 0.0001\n"
]
}
],
"source": [
"from itertools import product\n",
"import pickle\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"for C in [1, .1, .01, .001, .0001]:\n",
" lr = LogisticRegression(C=C, solver='sag', max_iter=100)\n",
"\n",
" lr.fit(train_x, train_y)\n",
" with open('lr_{}.pickle'.format(C), 'w') as handle:\n",
" pickle.dump(lr, handle)\n",
" \n",
" del(lr)\n",
" print(\"Finished training {}\".format(C))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished training 342\n",
"Finished training 150\n",
"Finished training 20\n",
"Finished training 5\n",
"Finished training 1\n"
]
}
],
"source": [
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
"import pickle\n",
"\n",
"# Number of columns is 342\n",
2016-12-03 15:36:46 -05:00
"for n_components in [342, 150, 20, 5, 1]:\n",
" lda = LinearDiscriminantAnalysis(n_components=n_components,\n",
" solver='lsqr')\n",
" lda.fit(train_x, train_y)\n",
" with open('lda_{}.pickle'.format(n_components), 'w') as handle:\n",
" pickle.dump(lda, handle)\n",
" \n",
" del(lda)\n",
" print(\"Finished training {}\".format(n_components))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished training 10\n",
"Finished training 5\n",
"Finished training 3\n",
"Finished training 1\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"for n_estimators in [10, 5, 3, 1]:\n",
" rf = RandomForestClassifier(n_estimators=n_estimators)\n",
" rf.fit(train_x, train_y)\n",
" with open('rf_{}.pickle'.format(n_estimators), 'w') as handle:\n",
" pickle.dump(rf, handle)\n",
" \n",
" del(rf) \n",
" print(\"Finished training {}\".format(n_estimators))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Evaluating the Results"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)\n",
"\n",
"valid_x = valid.drop('bad_loan', axis=1)\n",
"valid_y = valid['bad_loan']"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logistic Regression, C=1; Score: 0.89904155569\n",
"Logistic Regression, C=0.1; Score: 0.89904155569\n",
"Logistic Regression, C=0.01; Score: 0.89904155569\n",
"Logistic Regression, C=0.001; Score: 0.89904155569\n",
"Logistic Regression, C=0.0001; Score: 0.89904155569\n"
]
}
],
"source": [
"import pickle\n",
"lr_params = [1, .1, .01, .001, .0001]\n",
"\n",
"for C in lr_params:\n",
" with open('lr_{}.pickle'.format(C)) as handle:\n",
" model = pickle.load(handle)\n",
" \n",
" score = model.score(valid_x, valid_y)\n",
" \n",
" print('Logistic Regression, C={}; Score: {}'.format(\n",
" C, score\n",
" ))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linear Discriminant Analysis, components=342; Score: 0.896526629504\n",
"Linear Discriminant Analysis, components=150; Score: 0.896526629504\n",
"Linear Discriminant Analysis, components=20; Score: 0.896526629504\n",
"Linear Discriminant Analysis, components=5; Score: 0.896526629504\n",
"Linear Discriminant Analysis, components=1; Score: 0.896526629504\n"
]
}
],
"source": [
"lda_components = [342, 150, 20, 5, 1]\n",
"\n",
"for n_components in lda_components:\n",
" with open('lda_{}.pickle'.format(n_components)) as handle:\n",
" model = pickle.load(handle)\n",
" \n",
" score = model.score(valid_x, valid_y)\n",
" \n",
" print('Linear Discriminant Analysis, components={}; Score: {}'.format(\n",
" n_components, score\n",
" ))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Random Forests, estimators: 10; Score: 0.934577264671\n",
"Random Forests, estimators: 5; Score: 0.93108211135\n",
"Random Forests, estimators: 3; Score: 0.926601780251\n",
"Random Forests, estimators: 1; Score: 0.915037080309\n"
]
}
],
"source": [
"rf_estimators = [10, 5, 3, 1]\n",
"\n",
"for estimators in rf_estimators:\n",
" with open('rf_{}.pickle'.format(estimators)) as handle:\n",
" model = pickle.load(handle)\n",
" \n",
" score = model.score(valid_x, valid_y)\n",
" \n",
" print('Random Forests, estimators: {}; Score: {}'.format(\n",
" estimators, score\n",
" ))"
]
},
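{
"cell_type": "markdown",
"metadata": {},
"source": [
"Accuracy alone is a weak measure here: roughly 90% of loans are good, so the naive baseline already scores about 0.899, and the logistic regression and LDA scores sit right at it. Below is a minimal sketch of per-class precision and recall for the best random forest on the validation set, assuming the `rf_10.pickle` file written by the training cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",
"with open('rf_10.pickle', 'rb') as handle:\n",
"    best_rf = pickle.load(handle)\n",
"\n",
"pred_y = best_rf.predict(valid_x)\n",
"print(classification_report(valid_y, pred_y))"
]
},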
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Column Significance"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)\n",
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"activity_columns = ('Activity', list(filter(\n",
" lambda x: 'activity' in x,\n",
" train.columns)))\n",
"dcurrency_columns = ('Disbursal Currency', list(filter(\n",
" lambda x: 'disbursal_currency' in x,\n",
" train.columns)))\n",
"country_columns = ('Country', list(filter(\n",
" lambda x: 'country' in x,\n",
" train.columns)))\n",
"sector_columns = ('Sector', list(filter(\n",
" lambda x: 'sector' in x,\n",
" train.columns)))\n",
"repayment_columns = ('Repayment', list(filter(\n",
" lambda x: 'repayment' in x,\n",
" train.columns)))\n",
"nborrowers_columns = ('Number of Borrowers', ['num_borrowers'])\n",
"gratio_columns = ('Gender Ratio', ['gender_ratio'])\n",
"nlenders_columns = ('Lender Count', ['lender_count'])\n",
"damount_columns = ('Disbursal Amount', ['disbursal_amount'])\n",
"\n",
"colgroups = [\n",
" activity_columns, dcurrency_columns, country_columns,\n",
" sector_columns, repayment_columns, nborrowers_columns,\n",
" gratio_columns, nlenders_columns, damount_columns\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Removing columns: Activity\n",
"Score: 93.569\n",
"\n",
"Removing columns: Disbursal Currency\n",
"Score: 93.458\n",
"\n",
"Removing columns: Country\n",
"Score: 93.381\n",
"\n",
"Removing columns: Sector\n",
"Score: 93.628\n",
"\n",
"Removing columns: Repayment\n",
"Score: 93.180\n",
"\n",
"Removing columns: Number of Borrowers\n",
"Score: 93.450\n",
"\n",
"Removing columns: Gender Ratio\n",
"Score: 93.422\n",
"\n",
"Removing columns: Lender Count\n",
"Score: 90.606\n",
"\n",
"Removing columns: Disbursal Amount\n",
"Score: 92.342\n",
"\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"def validate_column_group(train, valid, colgroup):\n",
" sub_group = train.drop(colgroup, axis=1)\n",
" train_x = sub_group.drop('bad_loan', axis=1)\n",
" train_y = sub_group['bad_loan']\n",
" \n",
" rf = RandomForestClassifier(n_estimators=10)\n",
" rf.fit(train_x, train_y)\n",
" \n",
" valid_subgroup = valid.drop(colgroup, axis=1)\n",
" valid_x = valid_subgroup.drop('bad_loan', axis=1)\n",
" valid_y = valid_subgroup['bad_loan']\n",
" score = rf.score(valid_x, valid_y)\n",
" \n",
" return score\n",
" \n",
"for name, colgroup in colgroups:\n",
" score = validate_column_group(train, valid, colgroup)\n",
" \n",
" print 'Removing columns: {}'.format(name)\n",
" print 'Score: {:.3f}'.format(score*100)\n",
" print"
]
},
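{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a cross-check on the drop-a-column-group experiment, a random forest's built-in `feature_importances_` gives a per-column view directly. It measures something different (impurity reduction rather than held-out score), but it is much cheaper to compute. A sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train_x = train.drop('bad_loan', axis=1)\n",
"rf = RandomForestClassifier(n_estimators=10)\n",
"rf.fit(train_x, train['bad_loan'])\n",
"\n",
"importances = pd.Series(rf.feature_importances_, index=train_x.columns)\n",
"print(importances.sort_values(ascending=False).head(10))"
]
},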
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgAAAAF5CAYAAADpvZJuAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzt3XmYVNWd//H3t7obsKFYlAQRQZgYgaCi3W4EjAvKogE1\nJJMhGrPML4YokiBoVBxxARMjgsagcSYzGmNkJpEYWSUuMRpEiN3uQkyiRgUlGoEuFqHpOr8/zi36\ndlHVXVX0Us39vJ6nnq4699y6p05X9/3ec89izjlEREQkWmJtXQARERFpfQoAREREIkgBgIiISAQp\nABAREYkgBQAiIiIRpABAREQkghQAiIiIRJACABERkQhSACAiIhJBCgBEREQiKO8AwMxONrNFZrbe\nzJJmNj6HfU41syoz+9jMXjezrxVWXBEREWkOhbQAdAZeAC4GmlxIwMz6A0uAx4GhwO3Az8zszAKO\nLSIiIs3A9mUxIDNLAuc65xY1kudmYKxz7uhQ2gKgm3PurIIPLiIiIgVrjT4AJwGPpaWtAIa1wrFF\nREQkg9JWOMbBwMa0tI1AVzPr6Jzbmb6DmR0EjAbeAj5u8RKKiIjsPzoB/YEVzrl/ZsvUGgFAIUYD\nv2zrQoiIiLRj5wMPZNvYGgHA+0CvtLReQE2mq//AW/7H/cBgwNG793dYsuSnLVTExn3+85N47727\nAAMmAannYZOCn+nbsqXnur3+sx9//JdIJn+VNR+cDDzd5Ptk0/Bz5r9/Y+rf+zvk8llb09SpU5k3\nb16rHnN/kEu95f63U1zfiZak71thVG+5W7t2LRdccAHsOZdm1hoBwCpgbFraqCA9m6DZfzBQQSy2\nnC996SwqKipapIBN+eIXxzJ//gckk2PwH+UDYExarrH4uk7fNhb4e5Z9AAYFPzNvN1uy57MfeeTh\nvPTSRiBT38mllJWVUleXKmdDudRhw8+Z//6NqX/vbPW378coVLdu3drsu9We5VJvuf/tZP5et/Xf\nfkvQ960wqreCNH4L3TmX1wM/DHAocAyQBL4XvO4bbP8B8PNQ/v5AArgZGIgfPrgLOKORY1QADp5z\nsdgyN2TIma6mpsa1lZqaGjdkyJkuFlvmYIuDMx0sdZB04BwkndmDrkOHTzk4KW3bFgfHZ0hPOljs\njjjiFHfEEadk3T548Mg9n339+vWuY8dPO1iSlm+J69jx0+60004LlbN+e6512PBz5r9/Lu9t9mBQ\nf+nHWNpmv+dx48a1+jH3B7nUW65/Ox07ftrFYkub/XtXjPR9K4zqLXdVVVXOn0OpcI2czwsZBXAc\n8DxQFRzgVqAauD7YfjDQNxRgvAWcDZyBnz9gKvDvzrn0kQF76d37YiZPXs2qVQuJx+MFFLV5xONx\nVq1ayOTJq+nffwK9e5cRj19JPH48vXuPo3//UVx66cu8+eZTTJp0KvH4lZSUDKWkZDjx+Ol885sn\n881vfjYt/XgmTXqG555bzHPPLc6wn9++evVDez77IYccwhtvPMnQoXdQWno0sdgISkuPZujQO3jj\njSfp0qVLqJyj6NPnHPr3H5VzHTb8nPnvn8t7X3rpy/TrV0vnzjMoLT2azp1P47DDzmDy5DVt/nuW\n5pfr384bbzzJ5Mlrmv17JyLZ7dM8AC3FzCqAqqqqqqJs8nHOYWZ7PU/PA+y1LVt6rtvDkskksVh9\nDDd+/HgWLaqfkiFb2XK1r/vn8t4teYxcpdeb5KaQesv1b6etvxMtSd+3wqjeclddXU1lZSVApXOu\nOlu+Yh0FUNTC/5yy/aPKNz3X7WHhkz/AxIkTC36vfS1Loe9dDP/o0+tNclNIve3L387+Qt+3wqje\nmp9aAERERPYjubYAaDVAERGRCFIAICIiEkEKAERERCJIAYCIiEgEKQAQERGJIAUAIiIiEaQAQERE\nJIIUAIiIiESQAgAREZEIUgAgIiISQQoAREREIkgBgIiISAQpABAREYkgBQAiIiIRpABAREQkghQA\niIiIRJACABERkQhSACAiIhJBCgBEREQiSAGAiIhIBCkAEBERiSAFACIiIhGkAEBERCSCFACIiIhE\nkAIAERGRCFIAICIiEkEKAERERCJIAYCIiEgEKQAQERGJIAUAIiIiEaQAQEREJIIUAIiIiESQAgAR\nEZEIUgAgIiISQQoAREREIkgBgIiISAQpABAREYkgBQAiIiIRpABAREQkghQAiIiIRJACABERkQhS\nACAiIhJBCgBEREQiSAGAiIhIBCkAEBERiSAFACIiIhGkAEBERCSCFACIiIhEkAIAERGRCCooADCz\nS8zsTTPbYWbPmtnxTeT/npmtM7PtZva2mc01s46FFVlERET2Vd4BgJl9GbgVmAkcC7wIrDCznlny\nfwX4QZB/EPBN4MvA7ALLLCIiIvuokBaAqcDdzrn7nHPrgEnAdvyJPZNhwB+dc//nnHvbOfcYsAA4\noaASi4iIyD7LKwAwszKgEng8leacc8Bj+BN9Js8AlanbBGb2L8BZwNJCCiwiIiL7rjTP/D2BEmBj\nWvpGYGCmHZxzC4LbA380Mwv2/6lz7uZ8CysiIiLNI98AIG9mdipwNf5WwRrgcODHZvaec25WY/tO\nnTqVbt26NUibOHEiEydObKHSioiItB8LFixgwYIFDdK2bNmS077mW/BzE9wC2A5McM4tCqXfC3Rz\nzp2XYZ+ngFXOue+H0s7H9yPokuU4FUBVVVUVFRUVOZdPREQk6qqrq6msrASodM5VZ8uXVx8A51wt\nUAWMTKUFzfoj8ff6MykHkmlpydC+IiIi0soKuQUwF7jXzKrwTfpT8Sf5ewHM7D7gXefc1UH+xcBU\nM3sBWA18GrgBWOTyaX4QERGRZpN3AOCc+1XQqe8GoBfwAjDaOfdBkOVQYHdolxvxV/w3An2AD4BF\nwDX7UG4RERHZBwV1AnTO3QncmWXb6WmvUyf/Gws5loiIiDQ/rQUgIiISQQoAREREIkgBgIiISAQp\nABAREYkgBQAiIiIRpABAREQkghQAiIiIRJACABERkQhSACAiIhJBCgBEREQiSAGAiIhIBCkAEBER\niSAFACIiIhGkAEBERCSCFACIiIhEkAIAERGRCFIAICIiEkEKAERERCJIAYCIiEgEKQAQERGJIAUA\nIiIiEaQAQEREJIIUAIiIiESQAgAREZEIUgAgIiISQQoAREREIkgBgIiISAQpABAREYkgBQAiIiIR\npABAREQkghQAiIiIRJACgFbknGvrIoiIiAAKAFpcIpFgypSZDBhwBn37nsuAAWcwZcpMEolEWxdN\nREQirLStC7A/SyQSDBs2gbVrLyOZvA4wwDF//gqeeGICq1YtJB6Pt3EpRUQkitQC0IJmzJgTnPzH\n4E/+AEYyOYa1a6dyzTW3tmXxREQkwhQAtKDFi1eSTI7OuC2ZHMOiRStbuUQiIiKeAoAW4pyjtrYz\n9Vf+6Yza2nJ1DBQRkTahAKCFmBllZdu
AbCd4R1nZNsyyBQgiIiItRwFACxo3bjix2IqM22KxRxg/\nfkQrl0hERMRTANCCZs+ezuDBc4nFllPfEuCIxZYzePA8Zs2a1pbFExGRCFMA0ILi8TirVi1k8uTV\n9O8/ij59zqF//1FMnrxaQwBFRKRNaR6AFhaPx7n99uu4/XbfMVD3/EVEpBioBaAV6eQvIiLFQgGA\niIhIBCkAEBERiSAFACIiIhGkAEBERCSCFACIiIhEkAIAERGRCFIAICIiEkEKAERERCKooADAzC4x\nszfNbIeZPWtmxzeRv5uZzTezDWb2sZmtM7MxhRVZRERE9lXeUwGb2ZeBW4GLgDXAVGCFmR3hnPsw\nQ/4y4DHgfeALwAbgMGDzPpRbRERE9kEhawFMBe52zt0HYGaTgLOBbwI/ypD/34HuwEnOubog7e0C\njisiIiLNJK9bAMHVfCXweCrNOefwV/jDsuw2DlgF3Glm75vZy2Z2lZmp/4GIiEgbybcFoCdQAmxM\nS98IDMyyz78ApwP3A2O
"text/plain": [
"<matplotlib.figure.Figure at 0x7f12b780c810>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"train.groupby('lender_count')['bad_loan'].mean().plot(style='o');"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkUAAAFkCAYAAAAuZk/PAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzt3XmYXXWd5/H3JwJB0EBjJMEWRKSNcaOhBEK3RJwIYWvo\nGVEspRu3oVFcJo7KtGM3ND5NCz4sokZBVGSxZhDa1gYkCiqgbJogDRKCo8HIkkgUEwQSlvzmj3Nu\nc3KtVJKbqrq3Uu/X89RTdc/53nO+98et1IffWW5KKUiSJI13E7rdgCRJUi8wFEmSJGEokiRJAgxF\nkiRJgKFIkiQJMBRJkiQBhiJJkiTAUCRJkgQYiiRJkgBDkSRJEtADoSjJ8UluT7Ki/roxycGN9ROT\nfC7J8iSPJLksyY5t29g5yZVJHk2yNMnpSSa01RyQZH6SVUnuSXLsIL2ckGRxkseT3Jxk77b16+1F\nkiSNTV0PRcCvgROBvYA+4HvAN5NMr9efDRwGvBGYCbwAuLz15Dr8XAVsAcwAjgXeDpzSqNkVuAK4\nFtgD+DRwfpIDGzVHA2cAJwF7ArcD85JMbvQ6ZC+SJGnsSi9+IGyS3wIfpgocDwFvKaV8o143DVgI\nzCil3JrkEOBbwE6llOV1zd8BnwSeX0p5KslpwCGllFc39jEAbFdKObR+fDNwSynlg/XjUAW2c0op\npyeZtL5eRnhYJEnSCOqFmaL/lGRCkrcA2wA3Uc0cbUE1wwNAKWURsATYr140A7ijFYhq84DtgFc0\naq5p29281jaSbFnvq7mfUj+ntZ/XbEAvkiRpjNqi2w0AJHklVQjaGngE+K+llLuT7Ak8UUpZ2faU\nZcDU+uep9eP29a11tw9RMynJRGAH4FnrqJlW/zxlA3oZ7LU9D5gN3AusWledJEn6I1sDuwLzSim/\nHemd9UQoAu6mOtdnO+Ao4MIkM7vb0rCZDVzS7SYkSRrD3gZ8baR30hOhqJTyFPDL+uFtSfYBPghc\nCmyVZFLbDM0UYGn981JgravE6vUADzZqpgxSs7KUsjrJcuDpddQ097O+XgZzL8DFF1/M9OnThyhT\nuzlz5nDWWWd1u40xxTHrjOO28RyzzjhuG2fhwoUcc8wxUP8tHWk9EYoGMQGYCMwHngJmAc2Tm3cB\nbqxrbwI+lmRy47yig4AVVCdBt2oOadvHQfVySilPJplf7+db9X5SPz6nrh+ql5uGeC2rAKZPn85e\ne+21wQMg2G677RyzjeSYdcZx23iOWWcct46NyuknXQ9FSU4Fvk11wvJzqabIXgccVEpZmeRLwJlJ\nHqY63+gc4EellB/Xm/gOcBdwUZITgZ2ATwCfLaU8Wdd8ATihvgrty1TB5ijg0EYrZwIX1OHoVmAO\n1QnfFwCspxevPJMkaYzreigCdgS+ShVmVgD/QRWIvlevn0N1aOsyqtmjq4ETWk8upaxJcjjwearZ\no0epgsxJjZp7kxwGnAV8ALgPeFcp5ZpGzaX1PYlOoTok9lNgdinloUavQ/YiSZLGrq6HolLKu9ez\nfjXw/vprXTW/Bg5fz3aup7rsfqiaucDcTelFkiSNTT11nyKpqb+/v9stjDmOWWcct43nmHXGcett\nPXlH681Jkr2A+fPnz/fkOkmSNsKCBQvo6+sD6CulLBjp/TlTJEmShKFIkiQJMBRJkiQBhiJJkiTA\nUCRJkgQYiiRJkgBDkSRJEmAokiRJAgxFkiRJgKFIkiQJMBRJkiQBhiJJkiTAUCRJkgQYiiRJkgBD\nkSRJEmAokiRJAgxFkiRJgKFIkiQJMBSNmiOPPIojjngjq1ev7nYrkiRpEIaiUXLffbvx7//+r9x/\n//3dbkWSJA3CUDRq3tTtBiRJ0hAMRZIkSRiKJEmSAEORJEkSYCiSJEkCDEWSJEmAoUiSJAkwFEmS\nJAGGIkmSJMBQJEmSBBiKJEmSAEORJEkSYCiSJEkCDEWSJEmAoUiSJAkwFEmSJAGGIkmSJMBQJEmS\nBBiKJEmSgB4IRUn+PsmtSVYmWZbkG0le2lbzgyRrGl9PJ5nbVrNzkiuTPJpkaZLTk0xoqzkgyfwk\nq5Lck+TYQfo5IcniJI8nuTnJ3m3rJyb5XJLlSR5JclmSHYdzTCRJ0ujreigC9gc+A+wLvAHYEvhO\nkmc3agpwHjAFmArsBHy0tbIOP1cBWwAzgGOBtwOnNGp2Ba4ArgX2AD4NnJ/kwEbN0cAZwEnAnsDt\nwLwkkxu9nA0cBrwRmAm8ALh8UwZAkiR13xbdbqCUcmjzcZK3A78B+oAfNlY9Vkp5aB2bmQ28DHh9\nKWU5cEeSfwA+meTkUspTwHuAX5ZSWmFqUZLXAnOA79bL5gDnllIurHs5nioAvRM4Pcmk+ue3lFKu\nq2veASxMsk8p5daOB0KSJHVVL8wUtdueambod23L35bkoSR3JDm1bSZpBnBHHYha5gHbAa9o1FzT\nts15wH4ASbakCmLXtlaWUkr9nP3qRa+hCpLNmkXAkkaNJEkag7o+U9SUJFSHp35YSrmrseoS4FfA\nA8CrgdOBlwJH1eunAsvaNresse72IWomJZkI7AA8ax010+qfpwBPlFJWDlIzdQNeoiRJ6lE9FYqA\nucDLgb9sLiylnN94+LMkS4Frk7y4lLJ4NBvsXHVe+HHHHcc222wDQH9/P/39/d1sSpKknjAwMMDA\nwMBay1asWDGqPfRMKEryWeBQYP9SyoPrKb+l/r47sBhYCuzdVjOl/t7a1tLGsmbNylLK6iTLgafX\nUbO0sY2tkkxqmy1q1qzDe4HjOe+889htt92GLpUkaZwZbKJgwYIF9PX1jVoPPXFOUR2IjqQ6UXrJ\nBjxlT6rzjlqB5ybgVW1XiR0ErAAWNmpmtW3noHo5pZQngfnNmvpw3izgxnrRfOCptpppwC6t7UiS\npLGp6zNF9f2G+oEjgEeTtGZqVpRSViXZDXgr1SX3v6W6nP5M4LpSyp117XeAu4CLkpxIdcn+J4DP\n1mEH4AvACUlOA75MFWyOopqdajkTuCDJfOBWqqvRtgEuACilrEzyJeDMJA8DjwDnAD/yyjNJksa2\nroci4HiqWZ8ftC1/B3Ah8ATV/Ys+CGwL/Br4OvDPrcJSypokhwOfp5rVeZQqyJzUqLk3yWHAWcAH\ngPuAd5VSrmnUXFrPNp1CdUjsp8DstlsBzKE6zHYZMBG4GjhhUwZAkiR1X9dDUSllyEN4pZT7gAM2\nYDu/Bg5fT831VJfdD1Uzl9ZZ0YOvXw28v/6SJEmbiZ44p0iSJKnbDEWSJEkYiiRJkgBDkSRJEmAo\nkiRJAgxFkiRJgKFIkiQJMBRJkiQBhiJJkiTAUCRJkgQYiiRJkgBDkSRJEmAokiRJAgxFkiRJgKFI\nkiQJMBRJkiQBhiJJkiTAUCRJkgQYiiRJkgBDkSRJEmAokiRJAgxFkiRJgKFIkiQJMBRJkiQBhiJJ\nkiTAUCRJkgQYiiRJkgBDkSRJEmAokiR
JAgxFkiRJgKFIkiQJMBRJkiQBhiJJkiTAUCRJkgQYiiRJ\nkgBDkSRJEmAokiRJAgxFkiRJgKFIkiQJMBRJkiQBPRCKkvx9kluTrEyyLMk3kry0rWZiks8lWZ7k\nkSSXJdmxrWbnJFcmeTTJ0iSnJ5nQVnNAkvlJViW5J8mxg/RzQpLFSR5PcnOSvTe2F0mSNPZ0PRQB\n+wOfAfYF3gBsCXwnybMbNWcDhwFvBGYCLwAub62sw89VwBbADOBY4O3AKY2aXYErgGuBPYBPA+cn\nObBRczRwBnASsCdwOzAvyeQN7UWSJI1NW3S7gVLKoc3HSd4O/AboA36YZBLwTuAtpZTr6pp3AAuT\n7FNKuRWYDbwMeH0pZTlwR5J/AD6Z5ORSylPAe4BfllI+Wu9qUZLXAnOA79bL5gDnllIurPdzPFUA\neidw+gb2IkmSxqBemClqtz1QgN/Vj/uowtu1rYJSyiJgCbBfvWgGcEcdiFrmAdsBr2jUXNO2r3mt\nbSTZst5Xcz+lfk5rP6/
"text/plain": [
"<matplotlib.figure.Figure at 0x7f129146ea90>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"train['lender_count'].plot(kind='hist', bins=200);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, want to get an idea of how well this model will perform in production:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"import pickle\n",
"\n",
"train = pd.read_csv('processed_train.csv', index_col=0).dropna(axis=1)\n",
"valid = pd.read_csv('processed_validate.csv', index_col=0).dropna(axis=1)\n",
"\n",
"full = pd.concat((train, valid))\n",
"full_x = full.drop('bad_loan', axis=1)\n",
"full_y = full['bad_loan']\n",
"rf = RandomForestClassifier(n_estimators=10)\n",
"\n",
"rf.fit(full_x, full_y)\n",
"\n",
"with open('rf_validated.pickle', 'w') as handle:\n",
" pickle.dump(rf, handle)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model test accuracy: 93.651%\n",
"Naive test accuracy: 89.952%\n"
]
}
],
"source": [
"import pandas as pd\n",
"import pickle\n",
"\n",
"test = pd.read_csv('processed_test.csv', index_col=0).dropna(axis=1)\n",
"test_x = test.drop('bad_loan', axis=1)\n",
"test_y = test['bad_loan']\n",
"with open('rf_validated.pickle') as handle:\n",
" rf = pickle.load(handle)\n",
" score = rf.score(test_x, test_y)\n",
" \n",
" print 'Model test accuracy: {:.3f}%'.format(score*100)\n",
" \n",
"print 'Naive test accuracy: {:.3f}%'.format(\n",
" (1 - test_y.mean())*100\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}