{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MEPS Emergency Room Visits: download public use files\n",
    "\n",
    "Scrapes the MEPS download-search page for *Emergency Room Visits* household event files, converts each listed `HC-...` identifier into its zip file name, then downloads every archive and extracts it into the current working directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "from html.parser import HTMLParser\n",
    "from io import BytesIO\n",
    "from zipfile import BadZipFile, ZipFile\n",
    "\n",
    "import requests\n",
    "from IPython.display import display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# MEPS search-results page: household event files titled \"Emergency Room Visits\", all years.\n",
    "emergency_data_url = \"http://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits\"\n",
    "\n",
    "# Seconds to wait on each HTTP request; without a timeout requests.get can block indefinitely.\n",
    "REQUEST_TIMEOUT = 60\n",
    "\n",
    "emergency_page = requests.get(emergency_data_url, timeout=REQUEST_TIMEOUT).text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['HC-160E',\n",
       " 'HC-152E',\n",
       " 'HC-144E',\n",
       " 'HC-135E',\n",
       " 'HC-126E',\n",
       " 'HC-118E',\n",
       " 'HC-110E',\n",
       " 'HC-102E',\n",
       " 'HC-094E',\n",
       " 'HC-085E',\n",
       " 'HC-077E',\n",
       " 'HC-067E',\n",
       " 'HC-059E',\n",
       " 'HC-051E',\n",
       " 'HC-033E',\n",
       " 'HC-026E',\n",
       " 'HC-016E',\n",
       " 'HC-010E']"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "class MepsScraper(HTMLParser):\n",
    "    \"\"\"HTML parser that collects text nodes starting with a PUF identifier (e.g. 'HC-160E').\n",
    "\n",
    "    After `feed()`, the matching text nodes are available in `self.pufs`\n",
    "    in document order.\n",
    "    \"\"\"\n",
    "\n",
    "    # 'HC-' followed by digits and an optional uppercase letter suffix.\n",
    "    p = re.compile('HC-[0-9]*[A-Z]?')\n",
    "\n",
    "    def __init__(self, *args, **kwargs):\n",
    "        super().__init__(*args, **kwargs)\n",
    "        # Per-instance list: a class-level list would be shared by every parser\n",
    "        # instance and keep accumulating results across them.\n",
    "        self.pufs = []\n",
    "\n",
    "    def handle_data(self, data):\n",
    "        \"\"\"Record `data` when the text begins with a PUF identifier.\"\"\"\n",
    "        if self.p.match(data):\n",
    "            self.pufs.append(data)\n",
    "\n",
    "meps = MepsScraper()\n",
    "meps.feed(emergency_page)\n",
    "display(meps.pufs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['h160e',\n",
       " 'h152e',\n",
       " 'h144e',\n",
       " 'h135e',\n",
       " 'h126e',\n",
       " 'h118e',\n",
       " 'h110e',\n",
       " 'h102e',\n",
       " 'h94e',\n",
       " 'h85e',\n",
       " 'h77e',\n",
       " 'h67e',\n",
       " 'h59e',\n",
       " 'h51e',\n",
       " 'h33e',\n",
       " 'h26e',\n",
       " 'h16e',\n",
       " 'h10e']"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def norm_puf(puf):\n",
    "    \"\"\"Turn a PUF identifier such as 'HC-094E' into its lowercase file stem ('h94e').\n",
    "\n",
    "    The 'C-' separator is dropped and a single leading zero, if present, is\n",
    "    removed from the part that follows it. Identifiers with nothing after the\n",
    "    separator, or with no separator at all, do not raise IndexError; they are\n",
    "    lowercased with any separator removed.\n",
    "    \"\"\"\n",
    "    splits = puf.split(\"C-\")\n",
    "    # Guard the index: the scraper's regex also accepts a bare 'HC-'.\n",
    "    if len(splits) > 1 and splits[1].startswith('0'):\n",
    "        return ''.join([splits[0], splits[1][1:]]).lower()\n",
    "    return ''.join(splits).lower()\n",
    "\n",
    "final_pufs = list(map(norm_puf, meps.pufs))\n",
    "display(final_pufs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def puf_url(x):\n",
    "    \"\"\"Build the download URL of the zip archive for the normalized PUF name `x`.\"\"\"\n",
    "    return 'http://meps.ahrq.gov/mepsweb/data_files/pufs/' + x + 'ssp.zip'\n",
    "\n",
    "puf_urls = list(map(puf_url, final_pufs))\n",
    "\n",
    "# Reuse the URLs built above. Responses are stored without a status check;\n",
    "# downloads that are not zip archives are reported by the extraction step.\n",
    "puf_files = {\n",
    "    puf: requests.get(url, timeout=REQUEST_TIMEOUT)\n",
    "    for puf, url in zip(final_pufs, puf_urls)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Error extracting h26e'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Error extracting h16e'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'Error extracting h10e'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for key, value in puf_files.items():\n",
    "    try:\n",
    "        # The context manager closes the archive even when extraction fails.\n",
    "        with ZipFile(BytesIO(value.content)) as puf_zip:\n",
    "            # No target path is given, so members land in the current working directory.\n",
    "            puf_zip.extractall()\n",
    "    except BadZipFile:\n",
    "        # The response body was not a zip archive; report it and continue with the rest.\n",
    "        display(\"Error extracting {}\".format(key))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}