mirror of
https://github.com/bspeice/betterwithdata_cleaning_4
synced 2026-02-27 05:20:18 -05:00
Add emergency and outpatient data
Using the new scraper!
This commit is contained in:
@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
211
exploration/MEPS Scraping - Python.ipynb
Normal file
211
exploration/MEPS Scraping - Python.ipynb
Normal file
@ -0,0 +1,211 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"from html.parser import HTMLParser\n",
|
||||
"from IPython.display import display\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"emergency_data_url = \"http://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits\"\n",
|
||||
"\n",
|
||||
"emergency_page = requests.get(emergency_data_url).text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['HC-160E',\n",
|
||||
" 'HC-152E',\n",
|
||||
" 'HC-144E',\n",
|
||||
" 'HC-135E',\n",
|
||||
" 'HC-126E',\n",
|
||||
" 'HC-118E',\n",
|
||||
" 'HC-110E',\n",
|
||||
" 'HC-102E',\n",
|
||||
" 'HC-094E',\n",
|
||||
" 'HC-085E',\n",
|
||||
" 'HC-077E',\n",
|
||||
" 'HC-067E',\n",
|
||||
" 'HC-059E',\n",
|
||||
" 'HC-051E',\n",
|
||||
" 'HC-033E',\n",
|
||||
" 'HC-026E',\n",
|
||||
" 'HC-016E',\n",
|
||||
" 'HC-010E']"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"class MepsScraper(HTMLParser):\n",
|
||||
" p = re.compile('HC-[0-9]*[A-Z]?')\n",
|
||||
" pufs = []\n",
|
||||
" \n",
|
||||
" def handle_data(self, data):\n",
|
||||
" \n",
|
||||
" if self.p.match(data):\n",
|
||||
" self.pufs.append(data)\n",
|
||||
" \n",
|
||||
"meps = MepsScraper()\n",
|
||||
"meps.feed(emergency_page)\n",
|
||||
"display(meps.pufs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['h160e',\n",
|
||||
" 'h152e',\n",
|
||||
" 'h144e',\n",
|
||||
" 'h135e',\n",
|
||||
" 'h126e',\n",
|
||||
" 'h118e',\n",
|
||||
" 'h110e',\n",
|
||||
" 'h102e',\n",
|
||||
" 'h94e',\n",
|
||||
" 'h85e',\n",
|
||||
" 'h77e',\n",
|
||||
" 'h67e',\n",
|
||||
" 'h59e',\n",
|
||||
" 'h51e',\n",
|
||||
" 'h33e',\n",
|
||||
" 'h26e',\n",
|
||||
" 'h16e',\n",
|
||||
" 'h10e']"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def norm_puf(puf):\n",
|
||||
" splits = puf.split(\"C-\")\n",
|
||||
" if splits[1][0] == '0':\n",
|
||||
" return ''.join([splits[0], splits[1][1:]]).lower()\n",
|
||||
" else:\n",
|
||||
" return ''.join(splits).lower()\n",
|
||||
" \n",
|
||||
"final_pufs = list(map(norm_puf, meps.pufs))\n",
|
||||
"display(final_pufs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"puf_url = lambda x: 'http://meps.ahrq.gov/mepsweb/data_files/pufs/' + x + 'ssp.zip'\n",
|
||||
"\n",
|
||||
"puf_urls = list(map(puf_url, final_pufs))\n",
|
||||
"\n",
|
||||
"puf_files = {puf: requests.get(puf_url(puf)) for puf in final_pufs}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Error extracting h26e'"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Error extracting h16e'"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Error extracting h10e'"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from zipfile import ZipFile, BadZipFile\n",
|
||||
"from io import BytesIO\n",
|
||||
"\n",
|
||||
"for key, value in puf_files.items():\n",
|
||||
" try:\n",
|
||||
" puf_zip = ZipFile(BytesIO(value.content))\n",
|
||||
" puf_zip.extractall()\n",
|
||||
" except BadZipFile:\n",
|
||||
" display(\"Error extracting {}\".format(key))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
87
exploration/MEPS Scraping.ipynb
Normal file
87
exploration/MEPS Scraping.ipynb
Normal file
@ -0,0 +1,87 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 60,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"library(httr)\n",
|
||||
"library(foreign)\n",
|
||||
" \n",
|
||||
"download_puf <- function(short_puf) {\n",
|
||||
" puf_base <- \"http://meps.ahrq.gov/mepsweb/data_files/pufs/\"\n",
|
||||
" puf_suffix <- \"ssp.zip\"\n",
|
||||
" \n",
|
||||
" zip_filename <- paste0(short_puf, \"ssp.zip\")\n",
|
||||
" filename <- paste0(short_puf, \".ssp\")\n",
|
||||
" puf_url <- paste0(puf_base, zip_filename)\n",
|
||||
" download.file(puf_url, zip_filename)\n",
|
||||
" \n",
|
||||
" # unzip\n",
|
||||
" unzip(zip_filename, files = filename)\n",
|
||||
" saveName <- paste0(short_puf, \".csv\")\n",
|
||||
"\n",
|
||||
" # read the SAS transport file and write it out as a CSV file\n",
|
||||
" mydata <- read.xport(filename)\n",
|
||||
" write.table(mydata, file = saveName, sep = \",\")\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"normalize_puf <- function(full_puf) {\n",
|
||||
" stage_1 <- gsub(\"C-0\", \"\", full_puf)\n",
|
||||
" stage_2 <- gsub(\"C-\", \"\", stage_1)\n",
|
||||
" \n",
|
||||
" tolower(stage_2)\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ERROR",
|
||||
"evalue": "Error in lookup.xport(file): file not in SAS transfer format\n",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"Error in lookup.xport(file): file not in SAS transfer format\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Emergency Visit PUFs\n",
|
||||
"# pufs <- c('HC-160E','HC-152E','HC-144E','HC-135E','HC-126E','HC-118E','HC-110E','HC-102E','HC-094E','HC-085E','HC-077E','HC-067E','HC-059E','HC-051E','HC-033E','HC-026E','HC-016E','HC-010E')\n",
|
||||
"\n",
|
||||
"# Outpatient Visits\n",
|
||||
"pufs <- c('HC-160F','HC-152F','HC-144F','HC-135F','HC-126F','HC-118F','HC-110F','HC-102F','HC-094F','HC-085F','HC-077F','HC-067F','HC-059F','HC-051F','HC-033F','HC-026F','HC-016F','HC-010F')\n",
|
||||
"\n",
|
||||
"puf_downloads = c()\n",
|
||||
"for (puf in pufs) {\n",
|
||||
" download_puf(normalize_puf(puf))\n",
|
||||
"}"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "R",
|
||||
"language": "R",
|
||||
"name": "ir"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": "r",
|
||||
"file_extension": ".r",
|
||||
"mimetype": "text/x-r-source",
|
||||
"name": "R",
|
||||
"pygments_lexer": "r",
|
||||
"version": "3.2.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
BIN
exploration/h102f.ssp
Normal file
BIN
exploration/h102f.ssp
Normal file
Binary file not shown.
BIN
exploration/h102fssp.zip
Normal file
BIN
exploration/h102fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h110f.ssp
Normal file
BIN
exploration/h110f.ssp
Normal file
Binary file not shown.
BIN
exploration/h110fssp.zip
Normal file
BIN
exploration/h110fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h118f.ssp
Normal file
BIN
exploration/h118f.ssp
Normal file
Binary file not shown.
BIN
exploration/h118fssp.zip
Normal file
BIN
exploration/h118fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h126f.ssp
Normal file
BIN
exploration/h126f.ssp
Normal file
Binary file not shown.
BIN
exploration/h126fssp.zip
Normal file
BIN
exploration/h126fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h135f.ssp
Normal file
BIN
exploration/h135f.ssp
Normal file
Binary file not shown.
BIN
exploration/h135fssp.zip
Normal file
BIN
exploration/h135fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h144f.ssp
Normal file
BIN
exploration/h144f.ssp
Normal file
Binary file not shown.
BIN
exploration/h144fssp.zip
Normal file
BIN
exploration/h144fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h152f.ssp
Normal file
BIN
exploration/h152f.ssp
Normal file
Binary file not shown.
BIN
exploration/h152fssp.zip
Normal file
BIN
exploration/h152fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h159 .ssp
Normal file
BIN
exploration/h159 .ssp
Normal file
Binary file not shown.
BIN
exploration/h159.ssp
Normal file
BIN
exploration/h159.ssp
Normal file
Binary file not shown.
BIN
exploration/h160f.ssp
Normal file
BIN
exploration/h160f.ssp
Normal file
Binary file not shown.
BIN
exploration/h160fssp.zip
Normal file
BIN
exploration/h160fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h67e.ssp
Normal file
BIN
exploration/h67e.ssp
Normal file
Binary file not shown.
BIN
exploration/h67f.ssp
Normal file
BIN
exploration/h67f.ssp
Normal file
Binary file not shown.
BIN
exploration/h67fssp.zip
Normal file
BIN
exploration/h67fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h77f.ssp
Normal file
BIN
exploration/h77f.ssp
Normal file
Binary file not shown.
BIN
exploration/h77fssp.zip
Normal file
BIN
exploration/h77fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h85f.ssp
Normal file
BIN
exploration/h85f.ssp
Normal file
Binary file not shown.
BIN
exploration/h85fssp.zip
Normal file
BIN
exploration/h85fssp.zip
Normal file
Binary file not shown.
BIN
exploration/h94f.ssp
Normal file
BIN
exploration/h94f.ssp
Normal file
Binary file not shown.
BIN
exploration/h94fssp.zip
Normal file
BIN
exploration/h94fssp.zip
Normal file
Binary file not shown.
Reference in New Issue
Block a user