{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Understanding the Kiva Dataset\n", "\n", "Before we actually get into the work of predicting anything based on the data Kiva makes public, we first want to get a better picture of what the dataset actually looks like.\n", "\n", "Our first step: What is the schema of the data? Spark SQL will make it easy to query data in the future, but we need to know first what is available." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "\n", "# Local-mode Spark session shared by every cell in this notebook.\n", "sparkSql = (SparkSession.builder\n", " .master(\"local\")\n", " .appName(\"Kiva Exploration\")\n", " .getOrCreate())\n", "\n", "# Load the three Kiva JSON dumps; no schema is passed, so Spark infers one from the data.\n", "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n", "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n", "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- activity: string (nullable = true)\n", " |-- basket_amount: long (nullable = true)\n", " |-- bonus_credit_eligibility: boolean (nullable = true)\n", " |-- borrowers: array (nullable = true)\n", " | |-- element: struct (containsNull = true)\n", " | | |-- first_name: string (nullable = true)\n", " | | |-- gender: string (nullable = true)\n", " | | |-- last_name: string (nullable = true)\n", " | | |-- pictured: boolean (nullable = true)\n", " |-- currency_exchange_loss_amount: double (nullable = true)\n", " |-- delinquent: boolean (nullable = true)\n", " |-- description: struct (nullable = true)\n", " | |-- languages: array (nullable = true)\n", " | | |-- element: string (containsNull = true)\n", " | |-- texts: struct (nullable = true)\n", " | | |-- ar: string (nullable = true)\n", " | | |-- en: string (nullable = true)\n", " | | |-- es: string (nullable = true)\n", " | | |-- fr: string (nullable = 
true)\n", " | | |-- id: string (nullable = true)\n", " | | |-- mn: string (nullable = true)\n", " | | |-- pt: string (nullable = true)\n", " | | |-- ru: string (nullable = true)\n", " | | |-- vi: string (nullable = true)\n", " |-- funded_amount: long (nullable = true)\n", " |-- funded_date: string (nullable = true)\n", " |-- id: long (nullable = true)\n", " |-- image: struct (nullable = true)\n", " | |-- id: long (nullable = true)\n", " | |-- template_id: long (nullable = true)\n", " |-- journal_totals: struct (nullable = true)\n", " | |-- bulkEntries: long (nullable = true)\n", " | |-- entries: long (nullable = true)\n", " |-- lender_count: long (nullable = true)\n", " |-- loan_amount: long (nullable = true)\n", " |-- location: struct (nullable = true)\n", " | |-- country: string (nullable = true)\n", " | |-- country_code: string (nullable = true)\n", " | |-- geo: struct (nullable = true)\n", " | | |-- level: string (nullable = true)\n", " | | |-- pairs: string (nullable = true)\n", " | | |-- type: string (nullable = true)\n", " | |-- town: string (nullable = true)\n", " |-- name: string (nullable = true)\n", " |-- paid_amount: double (nullable = true)\n", " |-- paid_date: string (nullable = true)\n", " |-- partner_id: long (nullable = true)\n", " |-- payments: array (nullable = true)\n", " | |-- element: struct (containsNull = true)\n", " | | |-- amount: double (nullable = true)\n", " | | |-- currency_exchange_loss_amount: double (nullable = true)\n", " | | |-- local_amount: double (nullable = true)\n", " | | |-- payment_id: long (nullable = true)\n", " | | |-- processed_date: string (nullable = true)\n", " | | |-- rounded_local_amount: double (nullable = true)\n", " | | |-- settlement_date: string (nullable = true)\n", " |-- planned_expiration_date: string (nullable = true)\n", " |-- posted_date: string (nullable = true)\n", " |-- sector: string (nullable = true)\n", " |-- status: string (nullable = true)\n", " |-- tags: array (nullable = true)\n", " | |-- 
element: struct (containsNull = true)\n", " | | |-- name: string (nullable = true)\n", " |-- terms: struct (nullable = true)\n", " | |-- disbursal_amount: double (nullable = true)\n", " | |-- disbursal_currency: string (nullable = true)\n", " | |-- disbursal_date: string (nullable = true)\n", " | |-- loan_amount: long (nullable = true)\n", " | |-- local_payments: array (nullable = true)\n", " | | |-- element: struct (containsNull = true)\n", " | | | |-- amount: double (nullable = true)\n", " | | | |-- due_date: string (nullable = true)\n", " | |-- loss_liability: struct (nullable = true)\n", " | | |-- currency_exchange: string (nullable = true)\n", " | | |-- currency_exchange_coverage_rate: double (nullable = true)\n", " | | |-- nonpayment: string (nullable = true)\n", " | |-- repayment_interval: string (nullable = true)\n", " | |-- repayment_term: long (nullable = true)\n", " | |-- scheduled_payments: array (nullable = true)\n", " | | |-- element: struct (containsNull = true)\n", " | | | |-- amount: double (nullable = true)\n", " | | | |-- due_date: string (nullable = true)\n", " |-- themes: array (nullable = true)\n", " | |-- element: string (containsNull = true)\n", " |-- translator: struct (nullable = true)\n", " | |-- byline: string (nullable = true)\n", " | |-- image: long (nullable = true)\n", " |-- use: string (nullable = true)\n", " |-- video: struct (nullable = true)\n", " | |-- id: long (nullable = true)\n", " | |-- thumbnailImageId: long (nullable = true)\n", " | |-- title: string (nullable = true)\n", " | |-- youtubeId: string (nullable = true)\n", "\n" ] } ], "source": [ "loans.printSchema()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[Row(status=u'refunded', count=5504),\n", " Row(status=u'defaulted', count=21776),\n", " Row(status=u'in_repayment', count=155749),\n", " Row(status=u'reviewed', count=3),\n", " Row(status=u'deleted', count=2721),\n", 
" Row(status=u'paid', count=775330),\n", " Row(status=u'issue', count=199),\n", " Row(status=u'inactive_expired', count=12421),\n", " Row(status=u'fundraising', count=3986),\n", " Row(status=u'expired', count=33773),\n", " Row(status=u'inactive', count=2493),\n", " Row(status=u'funded', count=173),\n", " Row(status=u'', count=2)]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of loans for each status value.\n", "loans.groupby(loans.status).count().collect()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[Row(delinquent=None, count=970465), Row(delinquent=True, count=43665)]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of loans for each value of the delinquent flag.\n", "loans.groupby(loans.delinquent).count().collect()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[Row(status=u'refunded', count=156),\n", " Row(status=u'defaulted', count=20116),\n", " Row(status=u'in_repayment', count=23393)]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Status breakdown restricted to loans flagged as delinquent.\n", "loans.where(loans.delinquent == True).groupby(loans.status).count().collect()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[Row(status=u'in_repayment')]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Expose the DataFrame to Spark SQL under the name 'loans'.\n", "# createOrReplaceTempView replaces registerTempTable, which is deprecated since Spark 2.0.\n", "loans.createOrReplaceTempView('loans')\n", "sparkSql.sql('''\n", "SELECT loans.status\n", "FROM loans\n", "LIMIT 1\n", "''').collect()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, 
"nbformat_minor": 1 }