betterwithdata_cleaning_4/exploration/MEPS Scraping - Python.ipynb

212 lines
4.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"from html.parser import HTMLParser\n",
"from IPython.display import display\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Search-results page of the MEPS download site: all years, Household Event\n",
"# files, title search \"Emergency Room Visits\".\n",
"emergency_data_url = \"http://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits\"\n",
"\n",
"# Fail fast on a hung connection or an HTTP error status instead of handing\n",
"# an error page to the parsing cell.\n",
"emergency_response = requests.get(emergency_data_url, timeout=30)\n",
"emergency_response.raise_for_status()\n",
"emergency_page = emergency_response.text"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['HC-160E',\n",
" 'HC-152E',\n",
" 'HC-144E',\n",
" 'HC-135E',\n",
" 'HC-126E',\n",
" 'HC-118E',\n",
" 'HC-110E',\n",
" 'HC-102E',\n",
" 'HC-094E',\n",
" 'HC-085E',\n",
" 'HC-077E',\n",
" 'HC-067E',\n",
" 'HC-059E',\n",
" 'HC-051E',\n",
" 'HC-033E',\n",
" 'HC-026E',\n",
" 'HC-016E',\n",
" 'HC-010E']"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"class MepsScraper(HTMLParser):\n",
"    \"\"\"HTML parser that collects MEPS PUF identifiers such as 'HC-160E'.\n",
"\n",
"    For every text node whose stripped content starts with an identifier,\n",
"    that identifier is appended to ``self.pufs`` in document order.\n",
"    \"\"\"\n",
"\n",
"    # 'HC-', at least one digit, then an optional capital-letter suffix.\n",
"    p = re.compile('HC-[0-9]+[A-Z]?')\n",
"\n",
"    def __init__(self, *args, **kwargs):\n",
"        super().__init__(*args, **kwargs)\n",
"        # Per-instance list: a class-level list would be shared by every\n",
"        # MepsScraper instance and accumulate ids across parsers.\n",
"        self.pufs = []\n",
"\n",
"    def handle_data(self, data):\n",
"        \"\"\"Record the PUF identifier at the start of a text node, if any.\"\"\"\n",
"        found = self.p.match(data.strip())\n",
"        if found:\n",
"            # Keep only the identifier, not surrounding whitespace or text.\n",
"            self.pufs.append(found.group())\n",
" \n",
"meps = MepsScraper()\n",
"meps.feed(emergency_page)\n",
"# close() makes the parser process any text it is still buffering at the\n",
"# end of the input, so a trailing text node is not lost.\n",
"meps.close()\n",
"display(meps.pufs)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['h160e',\n",
" 'h152e',\n",
" 'h144e',\n",
" 'h135e',\n",
" 'h126e',\n",
" 'h118e',\n",
" 'h110e',\n",
" 'h102e',\n",
" 'h94e',\n",
" 'h85e',\n",
" 'h77e',\n",
" 'h67e',\n",
" 'h59e',\n",
" 'h51e',\n",
" 'h33e',\n",
" 'h26e',\n",
" 'h16e',\n",
" 'h10e']"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def norm_puf(puf):\n",
"    \"\"\"Convert a PUF identifier such as 'HC-094E' into the stem 'h94e'.\n",
"\n",
"    The 'C-' separator is dropped, one leading zero of the part after it is\n",
"    removed, and the result is lower-cased.\n",
"\n",
"    Raises:\n",
"        ValueError: if ``puf`` has no 'C-' separator or nothing follows it.\n",
"    \"\"\"\n",
"    prefix, separator, number = puf.strip().partition('C-')\n",
"    if not separator or not number:\n",
"        # Explicit error instead of an opaque IndexError on malformed ids.\n",
"        raise ValueError('Not a MEPS PUF identifier: {!r}'.format(puf))\n",
"    if number[0] == '0':\n",
"        number = number[1:]\n",
"    return (prefix + number).lower()\n",
" \n",
"# Normalised stem (e.g. 'h160e') for every scraped id, in the same order.\n",
"final_pufs = [norm_puf(puf_id) for puf_id in meps.pufs]\n",
"display(final_pufs)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"def puf_url(x):\n",
"    \"\"\"Return the URL of the 'ssp.zip' archive for the PUF file stem ``x``.\"\"\"\n",
"    return 'http://meps.ahrq.gov/mepsweb/data_files/pufs/' + x + 'ssp.zip'\n",
"\n",
"puf_urls = list(map(puf_url, final_pufs))\n",
"\n",
"# One response per PUF stem. The timeout keeps a stalled download from\n",
"# hanging the kernel; HTTP errors are deliberately not raised here, because\n",
"# the extraction cell reports every download that is not a valid zip.\n",
"puf_files = {puf: requests.get(url, timeout=60) for puf, url in zip(final_pufs, puf_urls)}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'Error extracting h26e'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Error extracting h16e'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Error extracting h10e'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from zipfile import ZipFile, BadZipFile\n",
"from io import BytesIO\n",
"\n",
"for puf, response in puf_files.items():\n",
"    try:\n",
"        # The context manager closes the archive even if extraction fails.\n",
"        with ZipFile(BytesIO(response.content)) as puf_zip:\n",
"            # No path given: members are extracted into the working directory.\n",
"            puf_zip.extractall()\n",
"    except BadZipFile:\n",
"        # The downloaded bytes are not a zip archive; report and continue.\n",
"        display(\"Error extracting {}\".format(puf))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}