Browse Source

Add code to unpack the Kiva data for Spark

pull/1/head
bspeice 4 years ago
parent
commit
71627e160e
4 changed files with 47 additions and 18 deletions
  1. +1
    -0
      .gitignore
  2. +1
    -17
      .idea/misc.xml
  3. +4
    -1
      kiva-dig.iml
  4. +41
    -0
      unpack_kiva.py

+ 1
- 0
.gitignore View File

@@ -47,3 +47,4 @@ fabric.properties

# Don't include the full snapshot ZIP since it's massive.
kiva_ds_json.zip
*.json

+ 1
- 17
.idea/misc.xml View File

@@ -17,23 +17,7 @@
<ConfirmationsSetting value="0" id="Add" />
<ConfirmationsSetting value="0" id="Remove" />
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_3" default="true" assert-keyword="false" jdk-15="false">
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_3" default="false" assert-keyword="false" jdk-15="false" project-jdk-name="Python 3.5.1 (C:\Users\Bradlee Speice\Anaconda3\python.exe)" project-jdk-type="Python SDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="masterDetails">
<states>
<state key="ProjectJDKs.UI">
<settings>
<last-edited>Python 3.5.1 (C:\Users\Bradlee Speice\Anaconda3\python.exe)</last-edited>
<splitter-proportions>
<option name="proportions">
<list>
<option value="0.2" />
</list>
</option>
</splitter-proportions>
</settings>
</state>
</states>
</component>
</project>

+ 4
- 1
kiva-dig.iml View File

@@ -7,8 +7,11 @@
</component>
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/kiva-data" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Python 3.5.1 (C:\Users\Bradlee Speice\Anaconda3\python.exe) interpreter library" level="application" />
</component>
</module>

+ 41
- 0
unpack_kiva.py View File

@@ -0,0 +1,41 @@
import zipfile
import json
import os

# Root output directory. The trailing slash lets the rest of the script build
# paths by plain string concatenation.
kiva_folder = 'kiva-data/'


def mkdirs():
    """Create the output folder and its per-type subfolders.

    Creates ``kiva_folder`` and the ``loans``, ``lenders`` and
    ``loans_lenders`` directories beneath it. Directories that already
    exist are left as they are, so the function can be called repeatedly.

    Raises:
        FileExistsError: If one of the paths exists but is not a directory.
    """
    # exist_ok=True keeps a re-run from failing: a plain os.mkdir() on the
    # root folder raises FileExistsError once the folder is already there.
    os.makedirs(kiva_folder, exist_ok=True)
    for sub in ('loans', 'lenders', 'loans_lenders'):
        os.makedirs(kiva_folder + sub, exist_ok=True)


def reformat_json(json_obj):
    """Serialize ``json_obj`` to compact, single-line JSON text.

    Keys are emitted in sorted order and no whitespace is placed after the
    item or key separators.

    Args:
        json_obj: Any value accepted by ``json.dumps``.

    Returns:
        The JSON text as a ``str``.
    """
    # (item separator, key separator) without the default trailing spaces.
    compact_separators = (',', ':')
    return json.dumps(json_obj, separators=compact_separators, sort_keys=True)


def unpack_kiva(filename="kiva_ds_json.zip"):
    """Unpack the JSON members of a zip archive into ``kiva_folder``.

    Every archive member whose name contains ``json`` is decoded as UTF-8
    and parsed. The first path component of the member name (for example
    ``loans`` in ``loans/1.json``) is used as the top-level key to look up
    in the parsed document. Each element found under that key is written
    as compact JSON on its own line, to a file at ``kiva_folder`` plus the
    member name.

    Members that are not valid JSON are reported on stdout and skipped.

    Args:
        filename: Path of the zip archive to unpack.

    Raises:
        TypeError: If ``filename`` is not a readable zip archive.
    """
    if not zipfile.is_zipfile(filename):
        raise TypeError("Unable to unpack zip - Corrupted file?")

    # Context managers make sure the archive and each member handle are
    # closed even when a member fails to parse or write.
    with zipfile.ZipFile(filename) as z:
        for json_name in z.namelist():
            if 'json' not in json_name:
                continue
            try:
                with z.open(json_name) as json_file:
                    json_string = json_file.read().decode('utf8')
                json_obj = json.loads(json_string)
                # Get `loan`, `lender`, etc.
                obj_type = json_name.split('/')[0]
                json_content = json_obj[obj_type]
                formatted = [reformat_json(j) for j in json_content]
                # NOTE(review): the member name is used verbatim in the
                # output path; validate it if the archive may be untrusted.
                with open(kiva_folder + json_name, 'w') as output:
                    output.write('\n'.join(formatted))
            except json.JSONDecodeError:
                print("Error decoding file {}".format(json_name))

if __name__ == "__main__":
    # Create the output directories first; unpack_kiva() writes into them.
    mkdirs()
    unpack_kiva()

Loading…
Cancel
Save