mirror of
https://github.com/bspeice/kiva-dig
synced 2024-12-04 12:48:10 -05:00
Initial code for predicting defaults
Currently just selects out the data we need from the giant JSON file
This commit is contained in:
parent
6a22e5ece7
commit
e7d0a98bc5
226
.ipynb_checkpoints/Default Prediction-checkpoint.ipynb
Normal file
226
.ipynb_checkpoints/Default Prediction-checkpoint.ipynb
Normal file
@ -0,0 +1,226 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# SparkSession is imported explicitly so this cell does not depend on the\n",
"# kernel having been launched through the PySpark shell.\n",
"from pyspark.sql import SparkSession\n",
"\n",
"# Local Spark session shared by every cell in this notebook.\n",
"sparkSql = (SparkSession.builder\n",
"            .master(\"local\")\n",
"            .appName(\"Kiva Exploration\")\n",
"            .getOrCreate())\n",
"\n",
"# Load each Kiva JSON dump and expose it to Spark SQL under the same name.\n",
"# createOrReplaceTempView replaces registerTempTable, which has been\n",
"# deprecated since Spark 2.0.\n",
"loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
"loans.createOrReplaceTempView('loans')\n",
"lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
"lenders.createOrReplaceTempView('lenders')\n",
"loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
"loans_lenders.createOrReplaceTempView('loans_lenders')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"root\n",
|
||||
" |-- activity: string (nullable = true)\n",
|
||||
" |-- basket_amount: long (nullable = true)\n",
|
||||
" |-- bonus_credit_eligibility: boolean (nullable = true)\n",
|
||||
" |-- borrowers: array (nullable = true)\n",
|
||||
" | |-- element: struct (containsNull = true)\n",
|
||||
" | | |-- first_name: string (nullable = true)\n",
|
||||
" | | |-- gender: string (nullable = true)\n",
|
||||
" | | |-- last_name: string (nullable = true)\n",
|
||||
" | | |-- pictured: boolean (nullable = true)\n",
|
||||
" |-- currency_exchange_loss_amount: double (nullable = true)\n",
|
||||
" |-- delinquent: boolean (nullable = true)\n",
|
||||
" |-- description: struct (nullable = true)\n",
|
||||
" | |-- languages: array (nullable = true)\n",
|
||||
" | | |-- element: string (containsNull = true)\n",
|
||||
" | |-- texts: struct (nullable = true)\n",
|
||||
" | | |-- ar: string (nullable = true)\n",
|
||||
" | | |-- en: string (nullable = true)\n",
|
||||
" | | |-- es: string (nullable = true)\n",
|
||||
" | | |-- fr: string (nullable = true)\n",
|
||||
" | | |-- id: string (nullable = true)\n",
|
||||
" | | |-- mn: string (nullable = true)\n",
|
||||
" | | |-- pt: string (nullable = true)\n",
|
||||
" | | |-- ru: string (nullable = true)\n",
|
||||
" | | |-- vi: string (nullable = true)\n",
|
||||
" |-- funded_amount: long (nullable = true)\n",
|
||||
" |-- funded_date: string (nullable = true)\n",
|
||||
" |-- id: long (nullable = true)\n",
|
||||
" |-- image: struct (nullable = true)\n",
|
||||
" | |-- id: long (nullable = true)\n",
|
||||
" | |-- template_id: long (nullable = true)\n",
|
||||
" |-- journal_totals: struct (nullable = true)\n",
|
||||
" | |-- bulkEntries: long (nullable = true)\n",
|
||||
" | |-- entries: long (nullable = true)\n",
|
||||
" |-- lender_count: long (nullable = true)\n",
|
||||
" |-- loan_amount: long (nullable = true)\n",
|
||||
" |-- location: struct (nullable = true)\n",
|
||||
" | |-- country: string (nullable = true)\n",
|
||||
" | |-- country_code: string (nullable = true)\n",
|
||||
" | |-- geo: struct (nullable = true)\n",
|
||||
" | | |-- level: string (nullable = true)\n",
|
||||
" | | |-- pairs: string (nullable = true)\n",
|
||||
" | | |-- type: string (nullable = true)\n",
|
||||
" | |-- town: string (nullable = true)\n",
|
||||
" |-- name: string (nullable = true)\n",
|
||||
" |-- paid_amount: double (nullable = true)\n",
|
||||
" |-- paid_date: string (nullable = true)\n",
|
||||
" |-- partner_id: long (nullable = true)\n",
|
||||
" |-- payments: array (nullable = true)\n",
|
||||
" | |-- element: struct (containsNull = true)\n",
|
||||
" | | |-- amount: double (nullable = true)\n",
|
||||
" | | |-- currency_exchange_loss_amount: double (nullable = true)\n",
|
||||
" | | |-- local_amount: double (nullable = true)\n",
|
||||
" | | |-- payment_id: long (nullable = true)\n",
|
||||
" | | |-- processed_date: string (nullable = true)\n",
|
||||
" | | |-- rounded_local_amount: double (nullable = true)\n",
|
||||
" | | |-- settlement_date: string (nullable = true)\n",
|
||||
" |-- planned_expiration_date: string (nullable = true)\n",
|
||||
" |-- posted_date: string (nullable = true)\n",
|
||||
" |-- sector: string (nullable = true)\n",
|
||||
" |-- status: string (nullable = true)\n",
|
||||
" |-- tags: array (nullable = true)\n",
|
||||
" | |-- element: struct (containsNull = true)\n",
|
||||
" | | |-- name: string (nullable = true)\n",
|
||||
" |-- terms: struct (nullable = true)\n",
|
||||
" | |-- disbursal_amount: double (nullable = true)\n",
|
||||
" | |-- disbursal_currency: string (nullable = true)\n",
|
||||
" | |-- disbursal_date: string (nullable = true)\n",
|
||||
" | |-- loan_amount: long (nullable = true)\n",
|
||||
" | |-- local_payments: array (nullable = true)\n",
|
||||
" | | |-- element: struct (containsNull = true)\n",
|
||||
" | | | |-- amount: double (nullable = true)\n",
|
||||
" | | | |-- due_date: string (nullable = true)\n",
|
||||
" | |-- loss_liability: struct (nullable = true)\n",
|
||||
" | | |-- currency_exchange: string (nullable = true)\n",
|
||||
" | | |-- currency_exchange_coverage_rate: double (nullable = true)\n",
|
||||
" | | |-- nonpayment: string (nullable = true)\n",
|
||||
" | |-- repayment_interval: string (nullable = true)\n",
|
||||
" | |-- repayment_term: long (nullable = true)\n",
|
||||
" | |-- scheduled_payments: array (nullable = true)\n",
|
||||
" | | |-- element: struct (containsNull = true)\n",
|
||||
" | | | |-- amount: double (nullable = true)\n",
|
||||
" | | | |-- due_date: string (nullable = true)\n",
|
||||
" |-- themes: array (nullable = true)\n",
|
||||
" | |-- element: string (containsNull = true)\n",
|
||||
" |-- translator: struct (nullable = true)\n",
|
||||
" | |-- byline: string (nullable = true)\n",
|
||||
" | |-- image: long (nullable = true)\n",
|
||||
" |-- use: string (nullable = true)\n",
|
||||
" |-- video: struct (nullable = true)\n",
|
||||
" | |-- id: long (nullable = true)\n",
|
||||
" | |-- thumbnailImageId: long (nullable = true)\n",
|
||||
" | |-- title: string (nullable = true)\n",
|
||||
" | |-- youtubeId: string (nullable = true)\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loans.printSchema()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pyspark\n",
|
||||
"\n",
|
||||
"def male_proportion(array):\n",
"    \"\"\"Return the share of borrowers in `array` whose gender is 'M'.\n",
"\n",
"    `array` is a loan's `borrowers` value: a sequence of rows with a\n",
"    `gender` attribute. The loans schema marks both the array and its\n",
"    elements as nullable, so null entries are counted as non-male and a\n",
"    missing or empty array yields None (SQL NULL) instead of raising.\n",
"    \"\"\"\n",
"    if not array:\n",
"        return None\n",
"\n",
"    num_males = sum(1 for item in array\n",
"                    if item is not None and item.gender == 'M')\n",
"    # float() keeps this a true division under the Python 2 kernel.\n",
"    return float(num_males) / len(array)\n",
|
||||
"\n",
|
||||
"# Expose the helper to Spark SQL so the query below can call it by name.\n",
"sparkSql.udf.register('male_proportion',\n",
"                      male_proportion,\n",
"                      pyspark.sql.types.FloatType())\n",
"\n",
"# 60/20/20 train/validation/test split; the fixed seed (101) keeps it repeatable.\n",
"train, validation, test = loans.randomSplit([.6, .2, .2], 101)\n",
"\n",
"# Feature-selection query; the '{}' placeholder is filled with a temp view\n",
"# name through str.format. bad_loan is 1 for any of the listed statuses or a\n",
"# delinquent loan, otherwise 0. Loans still 'fundraising' or 'funded' are\n",
"# filtered out.\n",
"# NOTE(review): Spark's DATEDIFF(end, start) returns end - start, so\n",
"# loan_length is disbursal_date minus planned_expiration_date - confirm that\n",
"# this argument order is the intended one.\n",
"query = '''\n",
"SELECT\n",
" id,\n",
" activity,\n",
" size(borrowers) as num_borrowers,\n",
" male_proportion(borrowers) as male_proportion,\n",
" lender_count,\n",
" location.country,\n",
" location.country_code,\n",
" partner_id,\n",
" sector,\n",
" tags,\n",
" DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n",
" terms.disbursal_amount,\n",
" terms.disbursal_currency,\n",
" terms.disbursal_date,\n",
" size(terms.scheduled_payments) as num_repayments,\n",
" terms.repayment_interval,\n",
" CASE WHEN\n",
" (status = 'refunded') OR\n",
" (status = 'defaulted') OR\n",
" (status = 'deleted') OR\n",
" (status = 'issue') OR\n",
" (status = 'inactive_expired') OR\n",
" (status = 'expired') OR\n",
" (status = 'inactive') OR\n",
" (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n",
" status,\n",
" delinquent\n",
" \n",
"FROM {}\n",
"WHERE\n",
" status != 'fundraising' AND\n",
" status != 'funded'\n",
"'''\n",
"\n",
"# createOrReplaceTempView replaces registerTempTable, which has been\n",
"# deprecated since Spark 2.0.\n",
"train.createOrReplaceTempView('loans_train')\n",
"validation.createOrReplaceTempView('loans_validation')\n",
"test.createOrReplaceTempView('loans_test')\n",
"\n",
"# Only the validation split is exported at this point.\n",
"sparkSql.sql(query.format('loans_validation')).write.json('validation_data-filtered.json')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 2",
|
||||
"language": "python",
|
||||
"name": "python2"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
226
Default Prediction.ipynb
Normal file
226
Default Prediction.ipynb
Normal file
@ -0,0 +1,226 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# SparkSession is imported explicitly so this cell does not depend on the\n",
"# kernel having been launched through the PySpark shell.\n",
"from pyspark.sql import SparkSession\n",
"\n",
"# Local Spark session shared by every cell in this notebook.\n",
"sparkSql = (SparkSession.builder\n",
"            .master(\"local\")\n",
"            .appName(\"Kiva Exploration\")\n",
"            .getOrCreate())\n",
"\n",
"# Load each Kiva JSON dump and expose it to Spark SQL under the same name.\n",
"# createOrReplaceTempView replaces registerTempTable, which has been\n",
"# deprecated since Spark 2.0.\n",
"loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
"loans.createOrReplaceTempView('loans')\n",
"lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
"lenders.createOrReplaceTempView('lenders')\n",
"loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
"loans_lenders.createOrReplaceTempView('loans_lenders')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"root\n",
|
||||
" |-- activity: string (nullable = true)\n",
|
||||
" |-- basket_amount: long (nullable = true)\n",
|
||||
" |-- bonus_credit_eligibility: boolean (nullable = true)\n",
|
||||
" |-- borrowers: array (nullable = true)\n",
|
||||
" | |-- element: struct (containsNull = true)\n",
|
||||
" | | |-- first_name: string (nullable = true)\n",
|
||||
" | | |-- gender: string (nullable = true)\n",
|
||||
" | | |-- last_name: string (nullable = true)\n",
|
||||
" | | |-- pictured: boolean (nullable = true)\n",
|
||||
" |-- currency_exchange_loss_amount: double (nullable = true)\n",
|
||||
" |-- delinquent: boolean (nullable = true)\n",
|
||||
" |-- description: struct (nullable = true)\n",
|
||||
" | |-- languages: array (nullable = true)\n",
|
||||
" | | |-- element: string (containsNull = true)\n",
|
||||
" | |-- texts: struct (nullable = true)\n",
|
||||
" | | |-- ar: string (nullable = true)\n",
|
||||
" | | |-- en: string (nullable = true)\n",
|
||||
" | | |-- es: string (nullable = true)\n",
|
||||
" | | |-- fr: string (nullable = true)\n",
|
||||
" | | |-- id: string (nullable = true)\n",
|
||||
" | | |-- mn: string (nullable = true)\n",
|
||||
" | | |-- pt: string (nullable = true)\n",
|
||||
" | | |-- ru: string (nullable = true)\n",
|
||||
" | | |-- vi: string (nullable = true)\n",
|
||||
" |-- funded_amount: long (nullable = true)\n",
|
||||
" |-- funded_date: string (nullable = true)\n",
|
||||
" |-- id: long (nullable = true)\n",
|
||||
" |-- image: struct (nullable = true)\n",
|
||||
" | |-- id: long (nullable = true)\n",
|
||||
" | |-- template_id: long (nullable = true)\n",
|
||||
" |-- journal_totals: struct (nullable = true)\n",
|
||||
" | |-- bulkEntries: long (nullable = true)\n",
|
||||
" | |-- entries: long (nullable = true)\n",
|
||||
" |-- lender_count: long (nullable = true)\n",
|
||||
" |-- loan_amount: long (nullable = true)\n",
|
||||
" |-- location: struct (nullable = true)\n",
|
||||
" | |-- country: string (nullable = true)\n",
|
||||
" | |-- country_code: string (nullable = true)\n",
|
||||
" | |-- geo: struct (nullable = true)\n",
|
||||
" | | |-- level: string (nullable = true)\n",
|
||||
" | | |-- pairs: string (nullable = true)\n",
|
||||
" | | |-- type: string (nullable = true)\n",
|
||||
" | |-- town: string (nullable = true)\n",
|
||||
" |-- name: string (nullable = true)\n",
|
||||
" |-- paid_amount: double (nullable = true)\n",
|
||||
" |-- paid_date: string (nullable = true)\n",
|
||||
" |-- partner_id: long (nullable = true)\n",
|
||||
" |-- payments: array (nullable = true)\n",
|
||||
" | |-- element: struct (containsNull = true)\n",
|
||||
" | | |-- amount: double (nullable = true)\n",
|
||||
" | | |-- currency_exchange_loss_amount: double (nullable = true)\n",
|
||||
" | | |-- local_amount: double (nullable = true)\n",
|
||||
" | | |-- payment_id: long (nullable = true)\n",
|
||||
" | | |-- processed_date: string (nullable = true)\n",
|
||||
" | | |-- rounded_local_amount: double (nullable = true)\n",
|
||||
" | | |-- settlement_date: string (nullable = true)\n",
|
||||
" |-- planned_expiration_date: string (nullable = true)\n",
|
||||
" |-- posted_date: string (nullable = true)\n",
|
||||
" |-- sector: string (nullable = true)\n",
|
||||
" |-- status: string (nullable = true)\n",
|
||||
" |-- tags: array (nullable = true)\n",
|
||||
" | |-- element: struct (containsNull = true)\n",
|
||||
" | | |-- name: string (nullable = true)\n",
|
||||
" |-- terms: struct (nullable = true)\n",
|
||||
" | |-- disbursal_amount: double (nullable = true)\n",
|
||||
" | |-- disbursal_currency: string (nullable = true)\n",
|
||||
" | |-- disbursal_date: string (nullable = true)\n",
|
||||
" | |-- loan_amount: long (nullable = true)\n",
|
||||
" | |-- local_payments: array (nullable = true)\n",
|
||||
" | | |-- element: struct (containsNull = true)\n",
|
||||
" | | | |-- amount: double (nullable = true)\n",
|
||||
" | | | |-- due_date: string (nullable = true)\n",
|
||||
" | |-- loss_liability: struct (nullable = true)\n",
|
||||
" | | |-- currency_exchange: string (nullable = true)\n",
|
||||
" | | |-- currency_exchange_coverage_rate: double (nullable = true)\n",
|
||||
" | | |-- nonpayment: string (nullable = true)\n",
|
||||
" | |-- repayment_interval: string (nullable = true)\n",
|
||||
" | |-- repayment_term: long (nullable = true)\n",
|
||||
" | |-- scheduled_payments: array (nullable = true)\n",
|
||||
" | | |-- element: struct (containsNull = true)\n",
|
||||
" | | | |-- amount: double (nullable = true)\n",
|
||||
" | | | |-- due_date: string (nullable = true)\n",
|
||||
" |-- themes: array (nullable = true)\n",
|
||||
" | |-- element: string (containsNull = true)\n",
|
||||
" |-- translator: struct (nullable = true)\n",
|
||||
" | |-- byline: string (nullable = true)\n",
|
||||
" | |-- image: long (nullable = true)\n",
|
||||
" |-- use: string (nullable = true)\n",
|
||||
" |-- video: struct (nullable = true)\n",
|
||||
" | |-- id: long (nullable = true)\n",
|
||||
" | |-- thumbnailImageId: long (nullable = true)\n",
|
||||
" | |-- title: string (nullable = true)\n",
|
||||
" | |-- youtubeId: string (nullable = true)\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loans.printSchema()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pyspark\n",
|
||||
"\n",
|
||||
"def male_proportion(array):\n",
"    \"\"\"Return the share of borrowers in `array` whose gender is 'M'.\n",
"\n",
"    `array` is a loan's `borrowers` value: a sequence of rows with a\n",
"    `gender` attribute. The loans schema marks both the array and its\n",
"    elements as nullable, so null entries are counted as non-male and a\n",
"    missing or empty array yields None (SQL NULL) instead of raising.\n",
"    \"\"\"\n",
"    if not array:\n",
"        return None\n",
"\n",
"    num_males = sum(1 for item in array\n",
"                    if item is not None and item.gender == 'M')\n",
"    # float() keeps this a true division under the Python 2 kernel.\n",
"    return float(num_males) / len(array)\n",
|
||||
"\n",
|
||||
"# Expose the helper to Spark SQL so the query below can call it by name.\n",
"sparkSql.udf.register('male_proportion',\n",
"                      male_proportion,\n",
"                      pyspark.sql.types.FloatType())\n",
"\n",
"# 60/20/20 train/validation/test split; the fixed seed (101) keeps it repeatable.\n",
"train, validation, test = loans.randomSplit([.6, .2, .2], 101)\n",
"\n",
"# Feature-selection query; the '{}' placeholder is filled with a temp view\n",
"# name through str.format. bad_loan is 1 for any of the listed statuses or a\n",
"# delinquent loan, otherwise 0. Loans still 'fundraising' or 'funded' are\n",
"# filtered out.\n",
"# NOTE(review): Spark's DATEDIFF(end, start) returns end - start, so\n",
"# loan_length is disbursal_date minus planned_expiration_date - confirm that\n",
"# this argument order is the intended one.\n",
"query = '''\n",
"SELECT\n",
" id,\n",
" activity,\n",
" size(borrowers) as num_borrowers,\n",
" male_proportion(borrowers) as male_proportion,\n",
" lender_count,\n",
" location.country,\n",
" location.country_code,\n",
" partner_id,\n",
" sector,\n",
" tags,\n",
" DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n",
" terms.disbursal_amount,\n",
" terms.disbursal_currency,\n",
" terms.disbursal_date,\n",
" size(terms.scheduled_payments) as num_repayments,\n",
" terms.repayment_interval,\n",
" CASE WHEN\n",
" (status = 'refunded') OR\n",
" (status = 'defaulted') OR\n",
" (status = 'deleted') OR\n",
" (status = 'issue') OR\n",
" (status = 'inactive_expired') OR\n",
" (status = 'expired') OR\n",
" (status = 'inactive') OR\n",
" (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n",
" status,\n",
" delinquent\n",
" \n",
"FROM {}\n",
"WHERE\n",
" status != 'fundraising' AND\n",
" status != 'funded'\n",
"'''\n",
"\n",
"# createOrReplaceTempView replaces registerTempTable, which has been\n",
"# deprecated since Spark 2.0.\n",
"train.createOrReplaceTempView('loans_train')\n",
"validation.createOrReplaceTempView('loans_validation')\n",
"test.createOrReplaceTempView('loans_test')\n",
"\n",
"# Only the validation split is exported at this point.\n",
"sparkSql.sql(query.format('loans_validation')).write.json('validation_data-filtered.json')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 2",
|
||||
"language": "python",
|
||||
"name": "python2"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
Loading…
Reference in New Issue
Block a user