diff --git a/.ipynb_checkpoints/Default Prediction-checkpoint.ipynb b/.ipynb_checkpoints/Default Prediction-checkpoint.ipynb new file mode 100644 index 0000000..174ecf3 --- /dev/null +++ b/.ipynb_checkpoints/Default Prediction-checkpoint.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sparkSql = (SparkSession.builder\n", + " .master(\"local\")\n", + " .appName(\"Kiva Exploration\")\n", + " .getOrCreate())\n", + "\n", + "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n", + "loans.createOrReplaceTempView('loans')\n", + "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n", + "lenders.createOrReplaceTempView('lenders')\n", + "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n", + "loans_lenders.createOrReplaceTempView('loans_lenders')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- activity: string (nullable = true)\n", + " |-- basket_amount: long (nullable = true)\n", + " |-- bonus_credit_eligibility: boolean (nullable = true)\n", + " |-- borrowers: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- first_name: string (nullable = true)\n", + " | | |-- gender: string (nullable = true)\n", + " | | |-- last_name: string (nullable = true)\n", + " | | |-- pictured: boolean (nullable = true)\n", + " |-- currency_exchange_loss_amount: double (nullable = true)\n", + " |-- delinquent: boolean (nullable = true)\n", + " |-- description: struct (nullable = true)\n", + " | |-- languages: array (nullable = true)\n", + " | | |-- element: string (containsNull = true)\n", + " | |-- texts: struct (nullable = true)\n", + " | | |-- ar: string (nullable = true)\n", + " | | |-- en: string (nullable = true)\n", + " | | |-- es: 
string (nullable = true)\n", + " | | |-- fr: string (nullable = true)\n", + " | | |-- id: string (nullable = true)\n", + " | | |-- mn: string (nullable = true)\n", + " | | |-- pt: string (nullable = true)\n", + " | | |-- ru: string (nullable = true)\n", + " | | |-- vi: string (nullable = true)\n", + " |-- funded_amount: long (nullable = true)\n", + " |-- funded_date: string (nullable = true)\n", + " |-- id: long (nullable = true)\n", + " |-- image: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- template_id: long (nullable = true)\n", + " |-- journal_totals: struct (nullable = true)\n", + " | |-- bulkEntries: long (nullable = true)\n", + " | |-- entries: long (nullable = true)\n", + " |-- lender_count: long (nullable = true)\n", + " |-- loan_amount: long (nullable = true)\n", + " |-- location: struct (nullable = true)\n", + " | |-- country: string (nullable = true)\n", + " | |-- country_code: string (nullable = true)\n", + " | |-- geo: struct (nullable = true)\n", + " | | |-- level: string (nullable = true)\n", + " | | |-- pairs: string (nullable = true)\n", + " | | |-- type: string (nullable = true)\n", + " | |-- town: string (nullable = true)\n", + " |-- name: string (nullable = true)\n", + " |-- paid_amount: double (nullable = true)\n", + " |-- paid_date: string (nullable = true)\n", + " |-- partner_id: long (nullable = true)\n", + " |-- payments: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- amount: double (nullable = true)\n", + " | | |-- currency_exchange_loss_amount: double (nullable = true)\n", + " | | |-- local_amount: double (nullable = true)\n", + " | | |-- payment_id: long (nullable = true)\n", + " | | |-- processed_date: string (nullable = true)\n", + " | | |-- rounded_local_amount: double (nullable = true)\n", + " | | |-- settlement_date: string (nullable = true)\n", + " |-- planned_expiration_date: string (nullable = true)\n", + " |-- posted_date: string (nullable = 
true)\n", + " |-- sector: string (nullable = true)\n", + " |-- status: string (nullable = true)\n", + " |-- tags: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- name: string (nullable = true)\n", + " |-- terms: struct (nullable = true)\n", + " | |-- disbursal_amount: double (nullable = true)\n", + " | |-- disbursal_currency: string (nullable = true)\n", + " | |-- disbursal_date: string (nullable = true)\n", + " | |-- loan_amount: long (nullable = true)\n", + " | |-- local_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " | |-- loss_liability: struct (nullable = true)\n", + " | | |-- currency_exchange: string (nullable = true)\n", + " | | |-- currency_exchange_coverage_rate: double (nullable = true)\n", + " | | |-- nonpayment: string (nullable = true)\n", + " | |-- repayment_interval: string (nullable = true)\n", + " | |-- repayment_term: long (nullable = true)\n", + " | |-- scheduled_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " |-- themes: array (nullable = true)\n", + " | |-- element: string (containsNull = true)\n", + " |-- translator: struct (nullable = true)\n", + " | |-- byline: string (nullable = true)\n", + " | |-- image: long (nullable = true)\n", + " |-- use: string (nullable = true)\n", + " |-- video: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- thumbnailImageId: long (nullable = true)\n", + " | |-- title: string (nullable = true)\n", + " | |-- youtubeId: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "loans.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + 
"import pyspark\n", + "\n", + "def male_proportion(array):\n", + " num_males = 0\n", + " for item in array:\n", + " if item.gender == 'M':\n", + " num_males += 1\n", + " \n", + " return float(num_males) / len(array)\n", + "\n", + "sparkSql.udf.register('male_proportion',\n", + " male_proportion,\n", + " pyspark.sql.types.FloatType())\n", + "\n", + "train, validation, test = loans.randomSplit([.6, .2, .2], 101)\n", + "\n", + "query = '''\n", + "SELECT\n", + " id,\n", + " activity,\n", + " size(borrowers) as num_borrowers,\n", + " male_proportion(borrowers) as male_proportion,\n", + " lender_count,\n", + " location.country,\n", + " location.country_code,\n", + " partner_id,\n", + " sector,\n", + " tags,\n", + " DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n", + " terms.disbursal_amount,\n", + " terms.disbursal_currency,\n", + " terms.disbursal_date,\n", + " size(terms.scheduled_payments) as num_repayments,\n", + " terms.repayment_interval,\n", + " CASE WHEN\n", + " (status = 'refunded') OR\n", + " (status = 'defaulted') OR\n", + " (status = 'deleted') OR\n", + " (status = 'issue') OR\n", + " (status = 'inactive_expired') OR\n", + " (status = 'expired') OR\n", + " (status = 'inactive') OR\n", + " (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n", + " status,\n", + " delinquent\n", + " \n", + "FROM {}\n", + "WHERE\n", + " status != 'fundraising' AND\n", + " status != 'funded'\n", + "'''\n", + "\n", + "train.registerTempTable('loans_train')\n", + "validation.registerTempTable('loans_validation')\n", + "test.registerTempTable('loans_test')\n", + "\n", + "sparkSql.sql(query.format('loans_validation')).write.json('validation_data-filtered.json')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/Default Prediction.ipynb b/Default Prediction.ipynb new file mode 100644 index 0000000..174ecf3 --- /dev/null +++ b/Default Prediction.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sparkSql = (SparkSession.builder\n", + " .master(\"local\")\n", + " .appName(\"Kiva Exploration\")\n", + " .getOrCreate())\n", + "\n", + "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n", + "loans.registerTempTable('loans')\n", + "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n", + "lenders.registerTempTable('lenders')\n", + "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n", + "loans_lenders.registerTempTable('loans_lenders')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- activity: string (nullable = true)\n", + " |-- basket_amount: long (nullable = true)\n", + " |-- bonus_credit_eligibility: boolean (nullable = true)\n", + " |-- borrowers: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- first_name: string (nullable = true)\n", + " | | |-- gender: string (nullable = true)\n", + " | | |-- last_name: string (nullable = true)\n", + " | | |-- pictured: boolean (nullable = true)\n", + " |-- currency_exchange_loss_amount: double (nullable = true)\n", + " |-- delinquent: boolean (nullable = true)\n", + " |-- description: struct (nullable = true)\n", + " | |-- languages: array (nullable = true)\n", + " | | |-- element: string (containsNull = true)\n", + " | |-- texts: struct (nullable = true)\n", + " | | |-- ar: string (nullable = true)\n", + " | | |-- en: 
string (nullable = true)\n", + " | | |-- es: string (nullable = true)\n", + " | | |-- fr: string (nullable = true)\n", + " | | |-- id: string (nullable = true)\n", + " | | |-- mn: string (nullable = true)\n", + " | | |-- pt: string (nullable = true)\n", + " | | |-- ru: string (nullable = true)\n", + " | | |-- vi: string (nullable = true)\n", + " |-- funded_amount: long (nullable = true)\n", + " |-- funded_date: string (nullable = true)\n", + " |-- id: long (nullable = true)\n", + " |-- image: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- template_id: long (nullable = true)\n", + " |-- journal_totals: struct (nullable = true)\n", + " | |-- bulkEntries: long (nullable = true)\n", + " | |-- entries: long (nullable = true)\n", + " |-- lender_count: long (nullable = true)\n", + " |-- loan_amount: long (nullable = true)\n", + " |-- location: struct (nullable = true)\n", + " | |-- country: string (nullable = true)\n", + " | |-- country_code: string (nullable = true)\n", + " | |-- geo: struct (nullable = true)\n", + " | | |-- level: string (nullable = true)\n", + " | | |-- pairs: string (nullable = true)\n", + " | | |-- type: string (nullable = true)\n", + " | |-- town: string (nullable = true)\n", + " |-- name: string (nullable = true)\n", + " |-- paid_amount: double (nullable = true)\n", + " |-- paid_date: string (nullable = true)\n", + " |-- partner_id: long (nullable = true)\n", + " |-- payments: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- amount: double (nullable = true)\n", + " | | |-- currency_exchange_loss_amount: double (nullable = true)\n", + " | | |-- local_amount: double (nullable = true)\n", + " | | |-- payment_id: long (nullable = true)\n", + " | | |-- processed_date: string (nullable = true)\n", + " | | |-- rounded_local_amount: double (nullable = true)\n", + " | | |-- settlement_date: string (nullable = true)\n", + " |-- planned_expiration_date: string (nullable = true)\n", + 
" |-- posted_date: string (nullable = true)\n", + " |-- sector: string (nullable = true)\n", + " |-- status: string (nullable = true)\n", + " |-- tags: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- name: string (nullable = true)\n", + " |-- terms: struct (nullable = true)\n", + " | |-- disbursal_amount: double (nullable = true)\n", + " | |-- disbursal_currency: string (nullable = true)\n", + " | |-- disbursal_date: string (nullable = true)\n", + " | |-- loan_amount: long (nullable = true)\n", + " | |-- local_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " | |-- loss_liability: struct (nullable = true)\n", + " | | |-- currency_exchange: string (nullable = true)\n", + " | | |-- currency_exchange_coverage_rate: double (nullable = true)\n", + " | | |-- nonpayment: string (nullable = true)\n", + " | |-- repayment_interval: string (nullable = true)\n", + " | |-- repayment_term: long (nullable = true)\n", + " | |-- scheduled_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " |-- themes: array (nullable = true)\n", + " | |-- element: string (containsNull = true)\n", + " |-- translator: struct (nullable = true)\n", + " | |-- byline: string (nullable = true)\n", + " | |-- image: long (nullable = true)\n", + " |-- use: string (nullable = true)\n", + " |-- video: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- thumbnailImageId: long (nullable = true)\n", + " | |-- title: string (nullable = true)\n", + " | |-- youtubeId: string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "loans.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + 
}, + "outputs": [], + "source": [ + "import pyspark\n", + "\n", + "def male_proportion(array):\n", + "    # Fraction of borrowers whose gender is 'M'; None (SQL NULL) for a null or empty array.\n", + "    if not array:\n", + "        return None\n", + "    num_males = sum(1 for item in array if item is not None and item.gender == 'M')\n", + "    # float() keeps this a true division under the Python 2 kernel.\n", + "    return float(num_males) / len(array)\n", + "\n", + "sparkSql.udf.register('male_proportion',\n", + " male_proportion,\n", + " pyspark.sql.types.FloatType())\n", + "\n", + "train, validation, test = loans.randomSplit([.6, .2, .2], 101)\n", + "\n", + "query = '''\n", + "SELECT\n", + " id,\n", + " activity,\n", + " size(borrowers) as num_borrowers,\n", + " male_proportion(borrowers) as male_proportion,\n", + " lender_count,\n", + " location.country,\n", + " location.country_code,\n", + " partner_id,\n", + " sector,\n", + " tags,\n", + " DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n", + " terms.disbursal_amount,\n", + " terms.disbursal_currency,\n", + " terms.disbursal_date,\n", + " size(terms.scheduled_payments) as num_repayments,\n", + " terms.repayment_interval,\n", + " CASE WHEN\n", + " (status = 'refunded') OR\n", + " (status = 'defaulted') OR\n", + " (status = 'deleted') OR\n", + " (status = 'issue') OR\n", + " (status = 'inactive_expired') OR\n", + " (status = 'expired') OR\n", + " (status = 'inactive') OR\n", + " (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n", + " status,\n", + " delinquent\n", + " \n", + "FROM {}\n", + "WHERE\n", + " status != 'fundraising' AND\n", + " status != 'funded'\n", + "'''\n", + "\n", + "train.createOrReplaceTempView('loans_train')\n", + "validation.createOrReplaceTempView('loans_validation')\n", + "test.createOrReplaceTempView('loans_test')\n", + "\n", + "sparkSql.sql(query.format('loans_validation')).write.json('validation_data-filtered.json')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}