{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "# Single-node local session shared by every cell below.\n",
    "sparkSql = (SparkSession.builder\n",
    "            .master(\"local\")\n",
    "            .appName(\"Kiva Exploration\")\n",
    "            .getOrCreate())\n",
    "\n",
    "# Load each Kiva JSON dump and expose it to Spark SQL under the same name.\n",
    "# createOrReplaceTempView is the replacement for registerTempTable, which\n",
    "# has been deprecated since Spark 2.0.\n",
    "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n",
    "loans.createOrReplaceTempView('loans')\n",
    "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n",
    "lenders.createOrReplaceTempView('lenders')\n",
    "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')\n",
    "loans_lenders.createOrReplaceTempView('loans_lenders')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- activity: string (nullable = true)\n",
      " |-- basket_amount: long (nullable = true)\n",
      " |-- bonus_credit_eligibility: boolean (nullable = true)\n",
      " |-- borrowers: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- first_name: string (nullable = true)\n",
      " |    |    |-- gender: string (nullable = true)\n",
      " |    |    |-- last_name: string (nullable = true)\n",
      " |    |    |-- pictured: boolean (nullable = true)\n",
      " |-- currency_exchange_loss_amount: double (nullable = true)\n",
      " |-- delinquent: boolean (nullable = true)\n",
      " |-- description: struct (nullable = true)\n",
      " |    |-- languages: array (nullable = true)\n",
      " |    |    |-- element: string (containsNull = true)\n",
      " |    |-- texts: struct (nullable = true)\n",
      " |    |    |-- ar: string (nullable = true)\n",
      " |    |    |-- en: string (nullable = true)\n",
      " |    |    |-- es: string (nullable = true)\n",
      " |    |    |-- fr: string (nullable = true)\n",
      " |    |    |-- id: string (nullable = true)\n",
      " |    |    |-- mn: string (nullable = true)\n",
      " |    |    |-- pt: string (nullable = true)\n",
      " |    |    |-- ru: string (nullable = true)\n",
      " |    |    |-- vi: string (nullable = true)\n",
      " |-- funded_amount: long (nullable = true)\n",
      " |-- funded_date: string (nullable = true)\n",
      " |-- id: long (nullable = true)\n",
      " |-- image: struct (nullable = true)\n",
      " |    |-- id: long (nullable = true)\n",
      " |    |-- template_id: long (nullable = true)\n",
      " |-- journal_totals: struct (nullable = true)\n",
      " |    |-- bulkEntries: long (nullable = true)\n",
      " |    |-- entries: long (nullable = true)\n",
      " |-- lender_count: long (nullable = true)\n",
      " |-- loan_amount: long (nullable = true)\n",
      " |-- location: struct (nullable = true)\n",
      " |    |-- country: string (nullable = true)\n",
      " |    |-- country_code: string (nullable = true)\n",
      " |    |-- geo: struct (nullable = true)\n",
      " |    |    |-- level: string (nullable = true)\n",
      " |    |    |-- pairs: string (nullable = true)\n",
      " |    |    |-- type: string (nullable = true)\n",
      " |    |-- town: string (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- paid_amount: double (nullable = true)\n",
      " |-- paid_date: string (nullable = true)\n",
      " |-- partner_id: long (nullable = true)\n",
      " |-- payments: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- amount: double (nullable = true)\n",
      " |    |    |-- currency_exchange_loss_amount: double (nullable = true)\n",
      " |    |    |-- local_amount: double (nullable = true)\n",
      " |    |    |-- payment_id: long (nullable = true)\n",
      " |    |    |-- processed_date: string (nullable = true)\n",
      " |    |    |-- rounded_local_amount: double (nullable = true)\n",
      " |    |    |-- settlement_date: string (nullable = true)\n",
      " |-- planned_expiration_date: string (nullable = true)\n",
      " |-- posted_date: string (nullable = true)\n",
      " |-- sector: string (nullable = true)\n",
      " |-- status: string (nullable = true)\n",
      " |-- tags: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- name: string (nullable = true)\n",
      " |-- terms: struct (nullable = true)\n",
      " |    |-- disbursal_amount: double (nullable = true)\n",
      " |    |-- disbursal_currency: string (nullable = true)\n",
      " |    |-- disbursal_date: string (nullable = true)\n",
      " |    |-- loan_amount: long (nullable = true)\n",
      " |    |-- local_payments: array (nullable = true)\n",
      " |    |    |-- element: struct (containsNull = true)\n",
      " |    |    |    |-- amount: double (nullable = true)\n",
      " |    |    |    |-- due_date: string (nullable = true)\n",
      " |    |-- loss_liability: struct (nullable = true)\n",
      " |    |    |-- currency_exchange: string (nullable = true)\n",
      " |    |    |-- currency_exchange_coverage_rate: double (nullable = true)\n",
      " |    |    |-- nonpayment: string (nullable = true)\n",
      " |    |-- repayment_interval: string (nullable = true)\n",
      " |    |-- repayment_term: long (nullable = true)\n",
      " |    |-- scheduled_payments: array (nullable = true)\n",
      " |    |    |-- element: struct (containsNull = true)\n",
      " |    |    |    |-- amount: double (nullable = true)\n",
      " |    |    |    |-- due_date: string (nullable = true)\n",
      " |-- themes: array (nullable = true)\n",
      " |    |-- element: string (containsNull = true)\n",
      " |-- translator: struct (nullable = true)\n",
      " |    |-- byline: string (nullable = true)\n",
      " |    |-- image: long (nullable = true)\n",
      " |-- use: string (nullable = true)\n",
      " |-- video: struct (nullable = true)\n",
      " |    |-- id: long (nullable = true)\n",
      " |    |-- thumbnailImageId: long (nullable = true)\n",
      " |    |-- title: string (nullable = true)\n",
      " |    |-- youtubeId: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "loans.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pyspark\n",
    "from pyspark.sql.types import FloatType\n",
    "\n",
    "\n",
    "def male_proportion(array):\n",
    "    \"\"\"Return the fraction of entries in `array` whose `gender` is 'M'.\n",
    "\n",
    "    Returns None (SQL NULL) for a missing or empty array: the proportion\n",
    "    is undefined there and the division would otherwise fail the task.\n",
    "    Null elements are counted in the denominator but never as male.\n",
    "    \"\"\"\n",
    "    if not array:\n",
    "        return None\n",
    "    num_males = sum(1 for item in array\n",
    "                    if item is not None and item.gender == 'M')\n",
    "    # float() keeps this a true division under the Python 2 kernel.\n",
    "    return float(num_males) / len(array)\n",
    "\n",
    "\n",
    "sparkSql.udf.register('male_proportion', male_proportion, FloatType())\n",
    "\n",
    "# 60/20/20 train/validation/test split; the fixed seed keeps it repeatable.\n",
    "train, validation, test = loans.randomSplit([.6, .2, .2], 101)\n",
    "\n",
    "# Feature-extraction query; '{}' is filled with the temp view to read from.\n",
    "# NOTE(review): DATEDIFF(end, start) returns end - start, so loan_length is\n",
    "# disbursal_date minus planned_expiration_date -- confirm the intended order.\n",
    "query = '''\n",
    "SELECT\n",
    "    id,\n",
    "    activity,\n",
    "    size(borrowers) as num_borrowers,\n",
    "    male_proportion(borrowers) as male_proportion,\n",
    "    lender_count,\n",
    "    location.country,\n",
    "    location.country_code,\n",
    "    partner_id,\n",
    "    sector,\n",
    "    tags,\n",
    "    DATEDIFF(terms.disbursal_date, planned_expiration_date) as loan_length,\n",
    "    terms.disbursal_amount,\n",
    "    terms.disbursal_currency,\n",
    "    terms.disbursal_date,\n",
    "    size(terms.scheduled_payments) as num_repayments,\n",
    "    terms.repayment_interval,\n",
    "    CASE WHEN\n",
    "        (status = 'refunded') OR\n",
    "        (status = 'defaulted') OR\n",
    "        (status = 'deleted') OR\n",
    "        (status = 'issue') OR\n",
    "        (status = 'inactive_expired') OR\n",
    "        (status = 'expired') OR\n",
    "        (status = 'inactive') OR\n",
    "        (delinquent = True) THEN 1 ELSE 0 END AS bad_loan,\n",
    "    status,\n",
    "    delinquent\n",
    "\n",
    "FROM {}\n",
    "WHERE\n",
    "    status != 'fundraising' AND\n",
    "    status != 'funded'\n",
    "'''\n",
    "\n",
    "# Register every split as a view so the query can target any of them.\n",
    "train.createOrReplaceTempView('loans_train')\n",
    "validation.createOrReplaceTempView('loans_validation')\n",
    "test.createOrReplaceTempView('loans_test')\n",
    "\n",
    "sparkSql.sql(query.format('loans_validation')).write.json('validation_data-filtered.json')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}