mirror of
https://github.com/bspeice/betterwithdata_cleaning_4
synced 2024-12-03 20:58:11 -05:00
MEPS scraping example
Add a full example of how to scrape the MEPS site
This commit is contained in:
parent
06653d8951
commit
8f0c6cef15
@ -118,23 +118,142 @@
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ERROR",
|
||||
"evalue": "Error in eval(expr, envir, enclos): object 'pufs' not found\n",
|
||||
"evalue": "Error in eval(expr, envir, enclos): object 'w' not found\n",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"Error in eval(expr, envir, enclos): object 'pufs' not found\n"
|
||||
"Error in eval(expr, envir, enclos): object 'w' not found\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading required package: xml2\n",
|
||||
"\n",
|
||||
"Attaching package: 'XML'\n",
|
||||
"\n",
|
||||
"The following object is masked from 'package:rvest':\n",
|
||||
"\n",
|
||||
" xml\n",
|
||||
"\n",
|
||||
"Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "ERROR",
|
||||
"evalue": "Error in lookup.xport(file): file not in SAS transfer format\n",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"Error in lookup.xport(file): file not in SAS transfer format\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Example usage: Download data by PUF\n",
|
||||
"#\n",
|
||||
"# Emergency Visit PUFs\n",
|
||||
"# pufs <- c('HC-160E','HC-152E','HC-144E','HC-135E','HC-126E','HC-118E','HC-110E','HC-102E','HC-094E','HC-085E','HC-077E','HC-067E','HC-059E','HC-051E','HC-033E','HC-026E','HC-016E','HC-010E')\n",
|
||||
"\n",
|
||||
"# Outpatient Visits\n",
|
||||
"# pufs <- c('HC-160F','HC-152F','HC-144F','HC-135F','HC-126F','HC-118F','HC-110F','HC-102F','HC-094F','HC-085F','HC-077F','HC-067F','HC-059F','HC-051F','HC-033F','HC-026F','HC-016F','HC-010F')\n",
|
||||
"\n",
|
||||
"puf_downloads = c()\n",
|
||||
"for (puf in pufs) {\n",
|
||||
" download_puf(normalize_puf(puf))\n",
|
||||
"# puf_downloads = c()\n",
|
||||
"# for (puf in pufs) {\n",
|
||||
"# download_puf(normalize_puf(puf))\n",
|
||||
"# }\n",
|
||||
"\n",
|
||||
"########\n",
|
||||
"\n",
|
||||
"# Example usage: Scrape the MEPS web page for all PUF ids\n",
|
||||
"# Feel free to modify the URL being used - Any of the links on [this page](http://meps.ahrq.gov/mepsweb/data_stats/download_data_files.jsp)\n",
|
||||
"# are fair game.\n",
|
||||
"\n",
|
||||
"#install.packages(\"rvest\")\n",
|
||||
"library(rvest)\n",
|
||||
"html <-\n",
|
||||
" read_html(\n",
|
||||
" \"http://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"library(XML)\n",
|
||||
"\n",
|
||||
"# to scrape all the file names from the web and form html links\n",
|
||||
"doc <- htmlParse(html, asText = TRUE)\n",
|
||||
"plain.text <- xpathSApply(doc, \"//a\", xmlValue)\n",
|
||||
"data <- c()\n",
|
||||
"for (i in 1:length(plain.text))\n",
|
||||
"{\n",
|
||||
" if (substr(plain.text[i], 1,3) == \"HC-\")\n",
|
||||
" {\n",
|
||||
" data[i] <- tolower(plain.text[i])\n",
|
||||
" }\n",
|
||||
" originalData <- data[!is.na(data)]\n",
|
||||
" \n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"mydata <- c()\n",
|
||||
"for (i in 1:length(originalData))\n",
|
||||
"{\n",
|
||||
" mydata[i] <- gsub(\"c-0\", \"\", originalData[i])\n",
|
||||
" mydata[i] <- gsub(\"c-\", \"\", mydata[i])\n",
|
||||
" \n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# to scrape all .ssp data files\n",
|
||||
"for (i in 1:length(mydata))\n",
|
||||
"{\n",
|
||||
" download_puf(mydata[i])\n",
|
||||
"}"
|
||||
]
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user