1
0
mirror of https://github.com/bspeice/kiva-dig synced 2024-12-03 20:28:10 -05:00

Initial code for predicting defaults

Currently just selects out the data we need from the giant JSON file
This commit is contained in:
Bradlee Speice 2016-11-05 15:10:59 -04:00
parent 6a22e5ece7
commit e7d0a98bc5
2 changed files with 452 additions and 0 deletions

View File

@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# One local-mode Spark session backs every query in this notebook;\n",
"# getOrCreate() returns the already-running session when the cell is re-run.\n",
"sparkSql = (SparkSession.builder\n",
"            .master(\"local\")\n",
"            .appName(\"Kiva Exploration\")\n",
"            .getOrCreate())\n",
"\n",
"# Load each JSON file under kiva-data/ and expose it to Spark SQL under the\n",
"# same name. createOrReplaceTempView replaces registerTempTable, which is\n",
"# deprecated as of Spark 2.0 (the release that introduced SparkSession).\n",
"loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
"loans.createOrReplaceTempView('loans')\n",
"lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
"lenders.createOrReplaceTempView('lenders')\n",
"loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
"loans_lenders.createOrReplaceTempView('loans_lenders')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- activity: string (nullable = true)\n",
" |-- basket_amount: long (nullable = true)\n",
" |-- bonus_credit_eligibility: boolean (nullable = true)\n",
" |-- borrowers: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- first_name: string (nullable = true)\n",
" | | |-- gender: string (nullable = true)\n",
" | | |-- last_name: string (nullable = true)\n",
" | | |-- pictured: boolean (nullable = true)\n",
" |-- currency_exchange_loss_amount: double (nullable = true)\n",
" |-- delinquent: boolean (nullable = true)\n",
" |-- description: struct (nullable = true)\n",
" | |-- languages: array (nullable = true)\n",
" | | |-- element: string (containsNull = true)\n",
" | |-- texts: struct (nullable = true)\n",
" | | |-- ar: string (nullable = true)\n",
" | | |-- en: string (nullable = true)\n",
" | | |-- es: string (nullable = true)\n",
" | | |-- fr: string (nullable = true)\n",
" | | |-- id: string (nullable = true)\n",
" | | |-- mn: string (nullable = true)\n",
" | | |-- pt: string (nullable = true)\n",
" | | |-- ru: string (nullable = true)\n",
" | | |-- vi: string (nullable = true)\n",
" |-- funded_amount: long (nullable = true)\n",
" |-- funded_date: string (nullable = true)\n",
" |-- id: long (nullable = true)\n",
" |-- image: struct (nullable = true)\n",
" | |-- id: long (nullable = true)\n",
" | |-- template_id: long (nullable = true)\n",
" |-- journal_totals: struct (nullable = true)\n",
" | |-- bulkEntries: long (nullable = true)\n",
" | |-- entries: long (nullable = true)\n",
" |-- lender_count: long (nullable = true)\n",
" |-- loan_amount: long (nullable = true)\n",
" |-- location: struct (nullable = true)\n",
" | |-- country: string (nullable = true)\n",
" | |-- country_code: string (nullable = true)\n",
" | |-- geo: struct (nullable = true)\n",
" | | |-- level: string (nullable = true)\n",
" | | |-- pairs: string (nullable = true)\n",
" | | |-- type: string (nullable = true)\n",
" | |-- town: string (nullable = true)\n",
" |-- name: string (nullable = true)\n",
" |-- paid_amount: double (nullable = true)\n",
" |-- paid_date: string (nullable = true)\n",
" |-- partner_id: long (nullable = true)\n",
" |-- payments: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- amount: double (nullable = true)\n",
" | | |-- currency_exchange_loss_amount: double (nullable = true)\n",
" | | |-- local_amount: double (nullable = true)\n",
" | | |-- payment_id: long (nullable = true)\n",
" | | |-- processed_date: string (nullable = true)\n",
" | | |-- rounded_local_amount: double (nullable = true)\n",
" | | |-- settlement_date: string (nullable = true)\n",
" |-- planned_expiration_date: string (nullable = true)\n",
" |-- posted_date: string (nullable = true)\n",
" |-- sector: string (nullable = true)\n",
" |-- status: string (nullable = true)\n",
" |-- tags: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- name: string (nullable = true)\n",
" |-- terms: struct (nullable = true)\n",
" | |-- disbursal_amount: double (nullable = true)\n",
" | |-- disbursal_currency: string (nullable = true)\n",
" | |-- disbursal_date: string (nullable = true)\n",
" | |-- loan_amount: long (nullable = true)\n",
" | |-- local_payments: array (nullable = true)\n",
" | | |-- element: struct (containsNull = true)\n",
" | | | |-- amount: double (nullable = true)\n",
" | | | |-- due_date: string (nullable = true)\n",
" | |-- loss_liability: struct (nullable = true)\n",
" | | |-- currency_exchange: string (nullable = true)\n",
" | | |-- currency_exchange_coverage_rate: double (nullable = true)\n",
" | | |-- nonpayment: string (nullable = true)\n",
" | |-- repayment_interval: string (nullable = true)\n",
" | |-- repayment_term: long (nullable = true)\n",
" | |-- scheduled_payments: array (nullable = true)\n",
" | | |-- element: struct (containsNull = true)\n",
" | | | |-- amount: double (nullable = true)\n",
" | | | |-- due_date: string (nullable = true)\n",
" |-- themes: array (nullable = true)\n",
" | |-- element: string (containsNull = true)\n",
" |-- translator: struct (nullable = true)\n",
" | |-- byline: string (nullable = true)\n",
" | |-- image: long (nullable = true)\n",
" |-- use: string (nullable = true)\n",
" |-- video: struct (nullable = true)\n",
" | |-- id: long (nullable = true)\n",
" | |-- thumbnailImageId: long (nullable = true)\n",
" | |-- title: string (nullable = true)\n",
" | |-- youtubeId: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"# Show the schema Spark derived for the loans DataFrame.\n",
"loans.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pyspark\n",
"\n",
"def male_proportion(array):\n",
"    \"\"\"Return the fraction of entries in `array` whose gender is 'M'.\n",
"\n",
"    `array` is the `borrowers` column of one loan: a list of row-like\n",
"    objects exposing a `gender` attribute. Both the column and its\n",
"    elements are nullable in the loans schema, so:\n",
"\n",
"    * a missing (None) or empty list yields None, which Spark stores as\n",
"      NULL, instead of raising TypeError / ZeroDivisionError;\n",
"    * None elements are never counted as male but still count towards\n",
"      the denominator.\n",
"    \"\"\"\n",
"    if not array:\n",
"        return None\n",
"\n",
"    num_males = sum(1 for item in array\n",
"                    if item is not None and item.gender == 'M')\n",
"    # float() keeps this a true division under the Python 2 kernel.\n",
"    return float(num_males) / len(array)\n",
"\n",
"# Make the Python helper callable from Spark SQL as `male_proportion`,\n",
"# declared as returning a float column.\n",
"sparkSql.udf.register(name='male_proportion',\n",
"                      f=male_proportion,\n",
"                      returnType=pyspark.sql.types.FloatType())\n",
"\n",
"# Randomly split the loans into train/validation/test using weights\n",
"# 0.6/0.2/0.2 and a fixed seed of 101.\n",
"train, validation, test = loans.randomSplit(weights=[.6, .2, .2], seed=101)\n",
"\n",
"# Template for the feature-extraction query; `{}` is filled in with the\n",
"# name of the temp table to read from via query.format(...).\n",
"# Rows with status 'fundraising' or 'funded' are excluded, and `bad_loan`\n",
"# is 1 when the loan is delinquent or its status is one of the listed\n",
"# values (refunded, defaulted, deleted, issue, inactive_expired, expired,\n",
"# inactive), otherwise 0.\n",
"# NOTE(review): loan_length is DATEDIFF(disbursal_date, planned_expiration_date);\n",
"# Spark's datediff(end, start) appears to return end - start, so this would be\n",
"# the number of days from planned expiration to disbursal -- confirm that this\n",
"# is the intended sign and pair of dates.\n",
"query = '''\n",
"SELECT\n",
" id,\n",
" activity,\n",
" size(borrowers) as num_borrowers,\n",
" male_proportion(borrowers) as male_proportion,\n",
" lender_count,\n",
" location.country,\n",
" location.country_code,\n",
" partner_id,\n",
" sector,\n",
" tags,\n",
" DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n",
" terms.disbursal_amount,\n",
" terms.disbursal_currency,\n",
" terms.disbursal_date,\n",
" size(terms.scheduled_payments) as num_repayments,\n",
" terms.repayment_interval,\n",
" CASE WHEN\n",
" (status = 'refunded') OR\n",
" (status = 'defaulted') OR\n",
" (status = 'deleted') OR\n",
" (status = 'issue') OR\n",
" (status = 'inactive_expired') OR\n",
" (status = 'expired') OR\n",
" (status = 'inactive') OR\n",
" (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n",
" status,\n",
" delinquent\n",
" \n",
"FROM {}\n",
"WHERE\n",
" status != 'fundraising' AND\n",
" status != 'funded'\n",
"'''\n",
"\n",
"# Register each split as a temp view so the query template can read it.\n",
"# createOrReplaceTempView replaces registerTempTable (deprecated in Spark 2.0).\n",
"train.createOrReplaceTempView('loans_train')\n",
"validation.createOrReplaceTempView('loans_validation')\n",
"test.createOrReplaceTempView('loans_test')\n",
"\n",
"# Only the validation split is exported here. mode('overwrite') lets the\n",
"# cell be re-run; the default save mode raises if the output path exists.\n",
"(sparkSql.sql(query.format('loans_validation'))\n",
"    .write\n",
"    .mode('overwrite')\n",
"    .json('validation_data-filtered.json'))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

226
Default Prediction.ipynb Normal file
View File

@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# One local-mode Spark session backs every query in this notebook;\n",
"# getOrCreate() returns the already-running session when the cell is re-run.\n",
"sparkSql = (SparkSession.builder\n",
"            .master(\"local\")\n",
"            .appName(\"Kiva Exploration\")\n",
"            .getOrCreate())\n",
"\n",
"# Load each JSON file under kiva-data/ and expose it to Spark SQL under the\n",
"# same name. createOrReplaceTempView replaces registerTempTable, which is\n",
"# deprecated as of Spark 2.0 (the release that introduced SparkSession).\n",
"loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
"loans.createOrReplaceTempView('loans')\n",
"lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
"lenders.createOrReplaceTempView('lenders')\n",
"loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
"loans_lenders.createOrReplaceTempView('loans_lenders')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- activity: string (nullable = true)\n",
" |-- basket_amount: long (nullable = true)\n",
" |-- bonus_credit_eligibility: boolean (nullable = true)\n",
" |-- borrowers: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- first_name: string (nullable = true)\n",
" | | |-- gender: string (nullable = true)\n",
" | | |-- last_name: string (nullable = true)\n",
" | | |-- pictured: boolean (nullable = true)\n",
" |-- currency_exchange_loss_amount: double (nullable = true)\n",
" |-- delinquent: boolean (nullable = true)\n",
" |-- description: struct (nullable = true)\n",
" | |-- languages: array (nullable = true)\n",
" | | |-- element: string (containsNull = true)\n",
" | |-- texts: struct (nullable = true)\n",
" | | |-- ar: string (nullable = true)\n",
" | | |-- en: string (nullable = true)\n",
" | | |-- es: string (nullable = true)\n",
" | | |-- fr: string (nullable = true)\n",
" | | |-- id: string (nullable = true)\n",
" | | |-- mn: string (nullable = true)\n",
" | | |-- pt: string (nullable = true)\n",
" | | |-- ru: string (nullable = true)\n",
" | | |-- vi: string (nullable = true)\n",
" |-- funded_amount: long (nullable = true)\n",
" |-- funded_date: string (nullable = true)\n",
" |-- id: long (nullable = true)\n",
" |-- image: struct (nullable = true)\n",
" | |-- id: long (nullable = true)\n",
" | |-- template_id: long (nullable = true)\n",
" |-- journal_totals: struct (nullable = true)\n",
" | |-- bulkEntries: long (nullable = true)\n",
" | |-- entries: long (nullable = true)\n",
" |-- lender_count: long (nullable = true)\n",
" |-- loan_amount: long (nullable = true)\n",
" |-- location: struct (nullable = true)\n",
" | |-- country: string (nullable = true)\n",
" | |-- country_code: string (nullable = true)\n",
" | |-- geo: struct (nullable = true)\n",
" | | |-- level: string (nullable = true)\n",
" | | |-- pairs: string (nullable = true)\n",
" | | |-- type: string (nullable = true)\n",
" | |-- town: string (nullable = true)\n",
" |-- name: string (nullable = true)\n",
" |-- paid_amount: double (nullable = true)\n",
" |-- paid_date: string (nullable = true)\n",
" |-- partner_id: long (nullable = true)\n",
" |-- payments: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- amount: double (nullable = true)\n",
" | | |-- currency_exchange_loss_amount: double (nullable = true)\n",
" | | |-- local_amount: double (nullable = true)\n",
" | | |-- payment_id: long (nullable = true)\n",
" | | |-- processed_date: string (nullable = true)\n",
" | | |-- rounded_local_amount: double (nullable = true)\n",
" | | |-- settlement_date: string (nullable = true)\n",
" |-- planned_expiration_date: string (nullable = true)\n",
" |-- posted_date: string (nullable = true)\n",
" |-- sector: string (nullable = true)\n",
" |-- status: string (nullable = true)\n",
" |-- tags: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- name: string (nullable = true)\n",
" |-- terms: struct (nullable = true)\n",
" | |-- disbursal_amount: double (nullable = true)\n",
" | |-- disbursal_currency: string (nullable = true)\n",
" | |-- disbursal_date: string (nullable = true)\n",
" | |-- loan_amount: long (nullable = true)\n",
" | |-- local_payments: array (nullable = true)\n",
" | | |-- element: struct (containsNull = true)\n",
" | | | |-- amount: double (nullable = true)\n",
" | | | |-- due_date: string (nullable = true)\n",
" | |-- loss_liability: struct (nullable = true)\n",
" | | |-- currency_exchange: string (nullable = true)\n",
" | | |-- currency_exchange_coverage_rate: double (nullable = true)\n",
" | | |-- nonpayment: string (nullable = true)\n",
" | |-- repayment_interval: string (nullable = true)\n",
" | |-- repayment_term: long (nullable = true)\n",
" | |-- scheduled_payments: array (nullable = true)\n",
" | | |-- element: struct (containsNull = true)\n",
" | | | |-- amount: double (nullable = true)\n",
" | | | |-- due_date: string (nullable = true)\n",
" |-- themes: array (nullable = true)\n",
" | |-- element: string (containsNull = true)\n",
" |-- translator: struct (nullable = true)\n",
" | |-- byline: string (nullable = true)\n",
" | |-- image: long (nullable = true)\n",
" |-- use: string (nullable = true)\n",
" |-- video: struct (nullable = true)\n",
" | |-- id: long (nullable = true)\n",
" | |-- thumbnailImageId: long (nullable = true)\n",
" | |-- title: string (nullable = true)\n",
" | |-- youtubeId: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"# Show the schema Spark derived for the loans DataFrame.\n",
"loans.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pyspark\n",
"\n",
"def male_proportion(array):\n",
"    \"\"\"Return the fraction of entries in `array` whose gender is 'M'.\n",
"\n",
"    `array` is the `borrowers` column of one loan: a list of row-like\n",
"    objects exposing a `gender` attribute. Both the column and its\n",
"    elements are nullable in the loans schema, so:\n",
"\n",
"    * a missing (None) or empty list yields None, which Spark stores as\n",
"      NULL, instead of raising TypeError / ZeroDivisionError;\n",
"    * None elements are never counted as male but still count towards\n",
"      the denominator.\n",
"    \"\"\"\n",
"    if not array:\n",
"        return None\n",
"\n",
"    num_males = sum(1 for item in array\n",
"                    if item is not None and item.gender == 'M')\n",
"    # float() keeps this a true division under the Python 2 kernel.\n",
"    return float(num_males) / len(array)\n",
"\n",
"# Make the Python helper callable from Spark SQL as `male_proportion`,\n",
"# declared as returning a float column.\n",
"sparkSql.udf.register(name='male_proportion',\n",
"                      f=male_proportion,\n",
"                      returnType=pyspark.sql.types.FloatType())\n",
"\n",
"# Randomly split the loans into train/validation/test using weights\n",
"# 0.6/0.2/0.2 and a fixed seed of 101.\n",
"train, validation, test = loans.randomSplit(weights=[.6, .2, .2], seed=101)\n",
"\n",
"# Template for the feature-extraction query; `{}` is filled in with the\n",
"# name of the temp table to read from via query.format(...).\n",
"# Rows with status 'fundraising' or 'funded' are excluded, and `bad_loan`\n",
"# is 1 when the loan is delinquent or its status is one of the listed\n",
"# values (refunded, defaulted, deleted, issue, inactive_expired, expired,\n",
"# inactive), otherwise 0.\n",
"# NOTE(review): loan_length is DATEDIFF(disbursal_date, planned_expiration_date);\n",
"# Spark's datediff(end, start) appears to return end - start, so this would be\n",
"# the number of days from planned expiration to disbursal -- confirm that this\n",
"# is the intended sign and pair of dates.\n",
"query = '''\n",
"SELECT\n",
" id,\n",
" activity,\n",
" size(borrowers) as num_borrowers,\n",
" male_proportion(borrowers) as male_proportion,\n",
" lender_count,\n",
" location.country,\n",
" location.country_code,\n",
" partner_id,\n",
" sector,\n",
" tags,\n",
" DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n",
" terms.disbursal_amount,\n",
" terms.disbursal_currency,\n",
" terms.disbursal_date,\n",
" size(terms.scheduled_payments) as num_repayments,\n",
" terms.repayment_interval,\n",
" CASE WHEN\n",
" (status = 'refunded') OR\n",
" (status = 'defaulted') OR\n",
" (status = 'deleted') OR\n",
" (status = 'issue') OR\n",
" (status = 'inactive_expired') OR\n",
" (status = 'expired') OR\n",
" (status = 'inactive') OR\n",
" (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n",
" status,\n",
" delinquent\n",
" \n",
"FROM {}\n",
"WHERE\n",
" status != 'fundraising' AND\n",
" status != 'funded'\n",
"'''\n",
"\n",
"# Register each split as a temp view so the query template can read it.\n",
"# createOrReplaceTempView replaces registerTempTable (deprecated in Spark 2.0).\n",
"train.createOrReplaceTempView('loans_train')\n",
"validation.createOrReplaceTempView('loans_validation')\n",
"test.createOrReplaceTempView('loans_test')\n",
"\n",
"# Only the validation split is exported here. mode('overwrite') lets the\n",
"# cell be re-run; the default save mode raises if the output path exists.\n",
"(sparkSql.sql(query.format('loans_validation'))\n",
"    .write\n",
"    .mode('overwrite')\n",
"    .json('validation_data-filtered.json'))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}