From f32e0b37f19b1a4585d32ca5c488ea639ce360de Mon Sep 17 00:00:00 2001 From: Bradlee Speice Date: Wed, 2 Nov 2016 18:31:44 -0400 Subject: [PATCH] Add a basic SQL example --- .../Kiva Datatypes-checkpoint.ipynb | 157 ---------- Kiva Datatypes.ipynb | 157 ---------- Kiva Exploration.ipynb | 274 ++++++++++++++++++ 3 files changed, 274 insertions(+), 314 deletions(-) delete mode 100644 .ipynb_checkpoints/Kiva Datatypes-checkpoint.ipynb delete mode 100644 Kiva Datatypes.ipynb create mode 100644 Kiva Exploration.ipynb diff --git a/.ipynb_checkpoints/Kiva Datatypes-checkpoint.ipynb b/.ipynb_checkpoints/Kiva Datatypes-checkpoint.ipynb deleted file mode 100644 index b05499c..0000000 --- a/.ipynb_checkpoints/Kiva Datatypes-checkpoint.ipynb +++ /dev/null @@ -1,157 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sparkSql = (SparkSession.builder\n", - " .master(\"local\")\n", - " .appName(\"Kiva Exploration\")\n", - " .getOrCreate())\n", - "\n", - "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n", - "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n", - "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('activity', 'string'),\n", - " ('arrears_amount', 'string'),\n", - " ('basket_amount', 'bigint'),\n", - " ('bonus_credit_eligibility', 'boolean'),\n", - " ('borrowers',\n", - " 'array>'),\n", - " ('currency_exchange_loss_amount', 'double'),\n", - " ('delinquent', 'string'),\n", - " ('description',\n", - " 'struct,texts:struct>'),\n", - " ('funded_amount', 'bigint'),\n", - " ('funded_date', 'string'),\n", - " ('id', 'bigint'),\n", - " ('image', 'struct'),\n", - " ('journal_totals', 'struct'),\n", - " ('lender_count', 'bigint'),\n", - " ('loan_amount', 
'bigint'),\n", - " ('location',\n", - " 'struct,town:string>'),\n", - " ('name', 'string'),\n", - " ('paid_amount', 'string'),\n", - " ('paid_date', 'string'),\n", - " ('partner_id', 'bigint'),\n", - " ('payments', 'array'),\n", - " ('planned_expiration_date', 'string'),\n", - " ('posted_date', 'string'),\n", - " ('sector', 'string'),\n", - " ('status', 'string'),\n", - " ('tags', 'array>'),\n", - " ('terms',\n", - " 'struct>,loss_liability:struct,repayment_interval:string,repayment_term:bigint,scheduled_payments:array>>'),\n", - " ('themes', 'array'),\n", - " ('translator', 'struct'),\n", - " ('use', 'string'),\n", - " ('video',\n", - " 'struct')]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "loans.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('country_code', 'string'),\n", - " ('image', 'struct'),\n", - " ('invitee_count', 'bigint'),\n", - " ('inviter_id', 'string'),\n", - " ('lender_id', 'string'),\n", - " ('loan_because', 'string'),\n", - " ('loan_count', 'bigint'),\n", - " ('member_since', 'string'),\n", - " ('name', 'string'),\n", - " ('occupation', 'string'),\n", - " ('occupational_info', 'string'),\n", - " ('personal_url', 'string'),\n", - " ('uid', 'string'),\n", - " ('whereabouts', 'string')]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lenders.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('id', 'bigint'), ('lender_ids', 'array')]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "loans_lenders.dtypes" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/Kiva Datatypes.ipynb b/Kiva Datatypes.ipynb deleted file mode 100644 index b05499c..0000000 --- a/Kiva Datatypes.ipynb +++ /dev/null @@ -1,157 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sparkSql = (SparkSession.builder\n", - " .master(\"local\")\n", - " .appName(\"Kiva Exploration\")\n", - " .getOrCreate())\n", - "\n", - "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n", - "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n", - "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('activity', 'string'),\n", - " ('arrears_amount', 'string'),\n", - " ('basket_amount', 'bigint'),\n", - " ('bonus_credit_eligibility', 'boolean'),\n", - " ('borrowers',\n", - " 'array>'),\n", - " ('currency_exchange_loss_amount', 'double'),\n", - " ('delinquent', 'string'),\n", - " ('description',\n", - " 'struct,texts:struct>'),\n", - " ('funded_amount', 'bigint'),\n", - " ('funded_date', 'string'),\n", - " ('id', 'bigint'),\n", - " ('image', 'struct'),\n", - " ('journal_totals', 'struct'),\n", - " ('lender_count', 'bigint'),\n", - " ('loan_amount', 'bigint'),\n", - " ('location',\n", - " 'struct,town:string>'),\n", - " ('name', 'string'),\n", - " ('paid_amount', 'string'),\n", - " ('paid_date', 'string'),\n", - " ('partner_id', 'bigint'),\n", - " ('payments', 'array'),\n", - " ('planned_expiration_date', 'string'),\n", - " ('posted_date', 
'string'),\n", - " ('sector', 'string'),\n", - " ('status', 'string'),\n", - " ('tags', 'array>'),\n", - " ('terms',\n", - " 'struct>,loss_liability:struct,repayment_interval:string,repayment_term:bigint,scheduled_payments:array>>'),\n", - " ('themes', 'array'),\n", - " ('translator', 'struct'),\n", - " ('use', 'string'),\n", - " ('video',\n", - " 'struct')]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "loans.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('country_code', 'string'),\n", - " ('image', 'struct'),\n", - " ('invitee_count', 'bigint'),\n", - " ('inviter_id', 'string'),\n", - " ('lender_id', 'string'),\n", - " ('loan_because', 'string'),\n", - " ('loan_count', 'bigint'),\n", - " ('member_since', 'string'),\n", - " ('name', 'string'),\n", - " ('occupation', 'string'),\n", - " ('occupational_info', 'string'),\n", - " ('personal_url', 'string'),\n", - " ('uid', 'string'),\n", - " ('whereabouts', 'string')]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lenders.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[('id', 'bigint'), ('lender_ids', 'array')]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "loans_lenders.dtypes" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} 
diff --git a/Kiva Exploration.ipynb b/Kiva Exploration.ipynb new file mode 100644 index 0000000..7013b7f --- /dev/null +++ b/Kiva Exploration.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Understanding the Kiva Dataset\n", + "\n", + "Before we start predicting anything from the data Kiva makes public, we want a clearer picture of what the dataset looks like.\n", + "\n", + "Our first step: what is the schema of the data? Spark SQL will make the data easy to query later, but first we need to know what is available." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sparkSql = (SparkSession.builder\n", + " .master(\"local\")\n", + " .appName(\"Kiva Exploration\")\n", + " .getOrCreate())\n", + "\n", + "loans = sparkSql.read.format('json').load('kiva-data/loans.json')\n", + "lenders = sparkSql.read.format('json').load('kiva-data/lenders.json')\n", + "loans_lenders = sparkSql.read.format('json').load('kiva-data/loans_lenders.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- activity: string (nullable = true)\n", + " |-- basket_amount: long (nullable = true)\n", + " |-- bonus_credit_eligibility: boolean (nullable = true)\n", + " |-- borrowers: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- first_name: string (nullable = true)\n", + " | | |-- gender: string (nullable = true)\n", + " | | |-- last_name: string (nullable = true)\n", + " | | |-- pictured: boolean (nullable = true)\n", + " |-- currency_exchange_loss_amount: double (nullable = true)\n", + " |-- delinquent: boolean (nullable = true)\n", + " |-- description: struct (nullable = true)\n", + " | |-- languages: array
(nullable = true)\n", + " | | |-- element: string (containsNull = true)\n", + " | |-- texts: struct (nullable = true)\n", + " | | |-- ar: string (nullable = true)\n", + " | | |-- en: string (nullable = true)\n", + " | | |-- es: string (nullable = true)\n", + " | | |-- fr: string (nullable = true)\n", + " | | |-- id: string (nullable = true)\n", + " | | |-- mn: string (nullable = true)\n", + " | | |-- pt: string (nullable = true)\n", + " | | |-- ru: string (nullable = true)\n", + " | | |-- vi: string (nullable = true)\n", + " |-- funded_amount: long (nullable = true)\n", + " |-- funded_date: string (nullable = true)\n", + " |-- id: long (nullable = true)\n", + " |-- image: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- template_id: long (nullable = true)\n", + " |-- journal_totals: struct (nullable = true)\n", + " | |-- bulkEntries: long (nullable = true)\n", + " | |-- entries: long (nullable = true)\n", + " |-- lender_count: long (nullable = true)\n", + " |-- loan_amount: long (nullable = true)\n", + " |-- location: struct (nullable = true)\n", + " | |-- country: string (nullable = true)\n", + " | |-- country_code: string (nullable = true)\n", + " | |-- geo: struct (nullable = true)\n", + " | | |-- level: string (nullable = true)\n", + " | | |-- pairs: string (nullable = true)\n", + " | | |-- type: string (nullable = true)\n", + " | |-- town: string (nullable = true)\n", + " |-- name: string (nullable = true)\n", + " |-- paid_amount: double (nullable = true)\n", + " |-- paid_date: string (nullable = true)\n", + " |-- partner_id: long (nullable = true)\n", + " |-- payments: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- amount: double (nullable = true)\n", + " | | |-- currency_exchange_loss_amount: double (nullable = true)\n", + " | | |-- local_amount: double (nullable = true)\n", + " | | |-- payment_id: long (nullable = true)\n", + " | | |-- processed_date: string (nullable = true)\n", + 
" | | |-- rounded_local_amount: double (nullable = true)\n", + " | | |-- settlement_date: string (nullable = true)\n", + " |-- planned_expiration_date: string (nullable = true)\n", + " |-- posted_date: string (nullable = true)\n", + " |-- sector: string (nullable = true)\n", + " |-- status: string (nullable = true)\n", + " |-- tags: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- name: string (nullable = true)\n", + " |-- terms: struct (nullable = true)\n", + " | |-- disbursal_amount: double (nullable = true)\n", + " | |-- disbursal_currency: string (nullable = true)\n", + " | |-- disbursal_date: string (nullable = true)\n", + " | |-- loan_amount: long (nullable = true)\n", + " | |-- local_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " | |-- loss_liability: struct (nullable = true)\n", + " | | |-- currency_exchange: string (nullable = true)\n", + " | | |-- currency_exchange_coverage_rate: double (nullable = true)\n", + " | | |-- nonpayment: string (nullable = true)\n", + " | |-- repayment_interval: string (nullable = true)\n", + " | |-- repayment_term: long (nullable = true)\n", + " | |-- scheduled_payments: array (nullable = true)\n", + " | | |-- element: struct (containsNull = true)\n", + " | | | |-- amount: double (nullable = true)\n", + " | | | |-- due_date: string (nullable = true)\n", + " |-- themes: array (nullable = true)\n", + " | |-- element: string (containsNull = true)\n", + " |-- translator: struct (nullable = true)\n", + " | |-- byline: string (nullable = true)\n", + " | |-- image: long (nullable = true)\n", + " |-- use: string (nullable = true)\n", + " |-- video: struct (nullable = true)\n", + " | |-- id: long (nullable = true)\n", + " | |-- thumbnailImageId: long (nullable = true)\n", + " | |-- title: string (nullable = true)\n", + " | |-- youtubeId: 
string (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "loans.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(status=u'refunded', count=5504),\n", + " Row(status=u'defaulted', count=21776),\n", + " Row(status=u'in_repayment', count=155749),\n", + " Row(status=u'reviewed', count=3),\n", + " Row(status=u'deleted', count=2721),\n", + " Row(status=u'paid', count=775330),\n", + " Row(status=u'issue', count=199),\n", + " Row(status=u'inactive_expired', count=12421),\n", + " Row(status=u'fundraising', count=3986),\n", + " Row(status=u'expired', count=33773),\n", + " Row(status=u'inactive', count=2493),\n", + " Row(status=u'funded', count=173),\n", + " Row(status=u'', count=2)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.groupby(loans.status).count().collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(delinquent=None, count=970465), Row(delinquent=True, count=43665)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.groupby(loans.delinquent).count().collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Row(status=u'refunded', count=156),\n", + " Row(status=u'defaulted', count=20116),\n", + " Row(status=u'in_repayment', count=23393)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.where(loans.delinquent == True).groupby(loans.status).count().collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + 
"[Row(status=u'in_repayment')]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loans.registerTempTable('loans')\n", + "sparkSql.sql('''\n", + "SELECT loans.status\n", + "FROM loans\n", + "LIMIT 1\n", + "''').collect()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}