MEPS scraping example

Add a full example of how to scrape the MEPS site
This commit is contained in:
bspeice 2015-11-07 18:21:29 -05:00
parent 06653d8951
commit 8f0c6cef15

View File

@ -118,23 +118,142 @@
"outputs": [
{
"ename": "ERROR",
"evalue": "Error in eval(expr, envir, enclos): object 'pufs' not found\n",
"evalue": "Error in eval(expr, envir, enclos): object 'w' not found\n",
"output_type": "error",
"traceback": [
"Error in eval(expr, envir, enclos): object 'pufs' not found\n"
"Error in eval(expr, envir, enclos): object 'w' not found\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading required package: xml2\n",
"\n",
"Attaching package: 'XML'\n",
"\n",
"The following object is masked from 'package:rvest':\n",
"\n",
" xml\n",
"\n",
"Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'"
]
},
{
"ename": "ERROR",
"evalue": "Error in lookup.xport(file): file not in SAS transfer format\n",
"output_type": "error",
"traceback": [
"Error in lookup.xport(file): file not in SAS transfer format\n"
]
}
],
"source": [
"# Example usage: Download data by PUF\n",
"#\n",
"# Emergency Visit PUFs\n",
"# pufs <- c('HC-160E','HC-152E','HC-144E','HC-135E','HC-126E','HC-118E','HC-110E','HC-102E','HC-094E','HC-085E','HC-077E','HC-067E','HC-059E','HC-051E','HC-033E','HC-026E','HC-016E','HC-010E')\n",
"\n",
"# Outpatient Visits\n",
"# pufs <- c('HC-160F','HC-152F','HC-144F','HC-135F','HC-126F','HC-118F','HC-110F','HC-102F','HC-094F','HC-085F','HC-077F','HC-067F','HC-059F','HC-051F','HC-033F','HC-026F','HC-016F','HC-010F')\n",
"\n",
"puf_downloads = c()\n",
"for (puf in pufs) {\n",
" download_puf(normalize_puf(puf))\n",
"# puf_downloads = c()\n",
"# for (puf in pufs) {\n",
"# download_puf(normalize_puf(puf))\n",
"# }\n",
"\n",
"########\n",
"\n",
"# Example usage: Scrape the MEPS web page for all PUF ids\n",
"# Feel free to modify the URL being used: any of the links on [this page](http://meps.ahrq.gov/mepsweb/data_stats/download_data_files.jsp)\n",
"# are fair game.\n",
"\n",
"#install.packages(\"rvest\")\n",
"library(rvest)\n",
"html <-\n",
" read_html(\n",
" \"http://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits\"\n",
" )\n",
"\n",
"library(XML)\n",
"\n",
"# Scrape every PUF id from the page: keep the text of each link that starts\n",
"# with \"HC-\", lower-cased (e.g. \"HC-094E\" becomes \"hc-094e\").\n",
"doc <- htmlParse(html, asText = TRUE)\n",
"plain.text <- as.character(xpathSApply(doc, \"//a\", xmlValue))\n",
"# %in% never yields NA, so links with missing text are dropped rather than\n",
"# propagated. Filtering the whole vector at once also avoids calling is.na()\n",
"# on a still-NULL vector, which warned on every iteration of the old loop.\n",
"originalData <- tolower(plain.text[substr(plain.text, 1, 3) %in% \"HC-\"])\n",
"\n",
"# Reduce each id to the form passed to download_puf(): drop \"c-\" and one\n",
"# leading zero, e.g. \"hc-094e\" becomes \"h94e\" and \"hc-160e\" becomes \"h160e\".\n",
"mydata <- gsub(\"c-\", \"\", gsub(\"c-0\", \"\", originalData))\n",
"\n",
"# Download the .ssp data file for every scraped PUF id. Iterating over the\n",
"# values (instead of 1:length()) makes this a no-op when no ids were found.\n",
"for (puf_id in mydata) {\n",
"  download_puf(puf_id)\n",
"}"
]
}