Add code to unpack the Kiva data for Spark

pull/1/head
bspeice 2016-10-21 14:38:00 -04:00
förälder ab3c48853c
incheckning 71627e160e
4 ändrade filer med 47 tillägg och 18 borttagningar

1
.gitignore vendored
Visa fil

@ -47,3 +47,4 @@ fabric.properties
# Don't include the full snapshot ZIP since it's massive.
kiva_ds_json.zip
*.json

Visa fil

@ -17,23 +17,7 @@
<ConfirmationsSetting value="0" id="Add" />
<ConfirmationsSetting value="0" id="Remove" />
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_3" default="true" assert-keyword="false" jdk-15="false">
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_3" default="false" assert-keyword="false" jdk-15="false" project-jdk-name="Python 3.5.1 (C:\Users\Bradlee Speice\Anaconda3\python.exe)" project-jdk-type="Python SDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="masterDetails">
<states>
<state key="ProjectJDKs.UI">
<settings>
<last-edited>Python 3.5.1 (C:\Users\Bradlee Speice\Anaconda3\python.exe)</last-edited>
<splitter-proportions>
<option name="proportions">
<list>
<option value="0.2" />
</list>
</option>
</splitter-proportions>
</settings>
</state>
</states>
</component>
</project>

Visa fil

@ -7,8 +7,11 @@
</component>
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/kiva-data" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Python 3.5.1 (C:\Users\Bradlee Speice\Anaconda3\python.exe) interpreter library" level="application" />
</component>
</module>

41
unpack_kiva.py Normal file
Visa fil

@ -0,0 +1,41 @@
import zipfile
import json
import os
# Base directory (relative to the working directory) under which the unpacked
# Kiva data is written. The trailing slash lets a relative path be appended
# to this string directly.
kiva_folder = 'kiva-data/'
def mkdirs(folder=None):
    """Create the output directory tree for the unpacked Kiva data.

    Creates the base folder plus the ``loans``, ``lenders`` and
    ``loans_lenders`` subdirectories. The call is idempotent: directories
    that already exist are left untouched, so the script can be re-run.

    Args:
        folder: Base output directory. When omitted, the module-level
            ``kiva_folder`` is used.
    """
    if folder is None:
        folder = kiva_folder
    for name in ('loans', 'lenders', 'loans_lenders'):
        # makedirs creates the base folder as a side effect, and exist_ok
        # avoids the FileExistsError a bare os.mkdir raises on a second run.
        os.makedirs(os.path.join(folder, name), exist_ok=True)
def reformat_json(json_obj):
    """Serialize *json_obj* as compact JSON with its keys sorted.

    No whitespace is emitted after the ``,`` and ``:`` separators and no
    indentation is requested, so the result is a single line of text.
    """
    compact_separators = (',', ':')
    return json.dumps(json_obj, separators=compact_separators, sort_keys=True)
def unpack_kiva(filename="kiva_ds_json.zip", output_folder=None):
    """Unpack the Kiva snapshot ZIP into newline-delimited JSON files.

    Every archive member whose name ends in ``.json`` is parsed. The
    collection stored under the key named after the member's top-level
    directory (e.g. the ``loans`` key for ``loans/1.json``) is extracted and
    each of its elements is written as one compact JSON document per line to
    ``<output_folder>/<member name>``. Members that are not valid JSON are
    reported on stdout and skipped.

    Args:
        filename: Path to the snapshot archive.
        output_folder: Directory that receives the output files. When
            omitted, the module-level ``kiva_folder`` is used.

    Raises:
        TypeError: If ``filename`` is not a readable ZIP archive.
        KeyError: If a parsed member has no key named after its top-level
            directory.
    """
    if not zipfile.is_zipfile(filename):
        raise TypeError("Unable to unpack zip - Corrupted file?")
    if output_folder is None:
        output_folder = kiva_folder

    # Context managers guarantee the archive and each member handle are
    # closed even when a member fails part-way through.
    with zipfile.ZipFile(filename) as archive:
        for json_name in archive.namelist():
            # Match on the extension rather than the substring 'json', which
            # would also pick up directory entries or unrelated names.
            if not json_name.endswith('.json'):
                continue

            try:
                with archive.open(json_name) as json_file:
                    json_obj = json.loads(json_file.read().decode('utf8'))
            except json.JSONDecodeError:
                print("Error decoding file {}".format(json_name))
                continue

            # The top-level directory of the member doubles as the key that
            # holds the records inside the document.
            obj_type = json_name.split('/')[0]
            formatted = [reformat_json(j) for j in json_obj[obj_type]]

            out_path = os.path.join(output_folder, json_name)
            out_dir = os.path.dirname(out_path)
            if out_dir:
                # Do not rely on the directory tree having been created
                # beforehand.
                os.makedirs(out_dir, exist_ok=True)
            with open(out_path, 'w', encoding='utf8') as output:
                output.write('\n'.join(formatted))
# Script entry point: build the output directory tree first, then unpack the
# snapshot archive into it.
if __name__ == "__main__":
    mkdirs()
    unpack_kiva()