"""Unpack the Kiva JSON snapshot archive into newline-delimited JSON files.

This is the ``unpack_kiva.py`` module introduced by the patch.  The same
patch also adds ``*.json`` to ``.gitignore`` (a pattern that matches the
files written here) and touches the IDE project files ``.idea/misc.xml``
and ``kiva-dig.iml``.

Each ``<type>/<name>.json`` entry in the archive is expected to hold a JSON
object with a ``<type>`` key (``loans``, ``lenders``, ...) whose value is a
list of records.  Every record is re-serialized compactly and written on
its own line to ``kiva-data/<type>/<name>.json``.
"""
import zipfile
import json
import os

# Root output directory; archive entry names are appended to it verbatim.
kiva_folder = 'kiva-data/'


def mkdirs():
    """Create the output folder and its per-type sub-folders.

    Safe to call repeatedly: directories that already exist are left alone
    instead of raising ``FileExistsError``.
    """
    for sub_folder in ('loans', 'lenders', 'loans_lenders'):
        # makedirs also creates `kiva_folder` itself when it is missing.
        os.makedirs(os.path.join(kiva_folder, sub_folder), exist_ok=True)


def reformat_json(json_obj):
    """Return *json_obj* as compact JSON with sorted keys.

    Sorted keys and fixed separators make the output deterministic, so the
    same record always serializes to the same single line.
    """
    return json.dumps(json_obj, sort_keys=True, separators=(',', ':'))


def unpack_kiva(filename="kiva_ds_json.zip"):
    """Convert every JSON entry of the archive to one-record-per-line files.

    Args:
        filename: Path of the snapshot ZIP archive.

    Raises:
        TypeError: If *filename* is not a readable ZIP archive.
        KeyError: If an entry's JSON object lacks the key named after the
            entry's top-level folder.

    Entries that are not valid JSON are reported on stdout and skipped; no
    output file is written for them.
    """
    if not zipfile.is_zipfile(filename):
        raise TypeError("Unable to unpack zip - Corrupted file?")

    # Context managers guarantee the archive and each member are closed,
    # even when an entry fails part-way through.
    with zipfile.ZipFile(filename) as archive:
        for json_name in archive.namelist():
            if 'json' not in json_name:
                continue

            with archive.open(json_name) as json_file:
                json_string = json_file.read().decode('utf8')

            try:
                json_obj = json.loads(json_string)
            except json.JSONDecodeError:
                print("Error decoding file {}".format(json_name))
                continue

            # Get `loan`, `lender`, etc. -- the top-level folder of the
            # entry doubles as the key that wraps the record list.
            obj_type = json_name.split('/')[0]
            json_content = json_obj[obj_type]
            formatted = [reformat_json(j) for j in json_content]

            output_path = os.path.join(kiva_folder, json_name)
            # Do not rely on mkdirs() having run (or on the archive only
            # containing the three known folders).
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as output:
                output.write('\n'.join(formatted))


if __name__ == '__main__':
    mkdirs()
    unpack_kiva()