mirror of
https://github.com/bspeice/betterwithdata_cleaning_4
synced 2024-12-04 21:28:09 -05:00
MEPS scraping example
Add a full example of how to scrape the MEPS site
This commit is contained in:
parent
06653d8951
commit
8f0c6cef15
@ -118,23 +118,142 @@
|
|||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"ename": "ERROR",
|
"ename": "ERROR",
|
||||||
"evalue": "Error in eval(expr, envir, enclos): object 'pufs' not found\n",
|
"evalue": "Error in eval(expr, envir, enclos): object 'w' not found\n",
|
||||||
"output_type": "error",
|
"output_type": "error",
|
||||||
"traceback": [
|
"traceback": [
|
||||||
"Error in eval(expr, envir, enclos): object 'pufs' not found\n"
|
"Error in eval(expr, envir, enclos): object 'w' not found\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Loading required package: xml2\n",
|
||||||
|
"\n",
|
||||||
|
"Attaching package: 'XML'\n",
|
||||||
|
"\n",
|
||||||
|
"The following object is masked from 'package:rvest':\n",
|
||||||
|
"\n",
|
||||||
|
" xml\n",
|
||||||
|
"\n",
|
||||||
|
"Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
|
||||||
|
"In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ename": "ERROR",
|
||||||
|
"evalue": "Error in lookup.xport(file): file not in SAS transfer format\n",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"Error in lookup.xport(file): file not in SAS transfer format\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# Example usage: Download data by PUF\n",
|
||||||
|
"#\n",
|
||||||
"# Emergency Visit PUF's\n",
|
"# Emergency Visit PUF's\n",
|
||||||
"# pufs <- c('HC-160E','HC-152E','HC-144E','HC-135E','HC-126E','HC-118E','HC-110E','HC-102E','HC-094E','HC-085E','HC-077E','HC-067E','HC-059E','HC-051E','HC-033E','HC-026E','HC-016E','HC-010E')\n",
|
"# pufs <- c('HC-160E','HC-152E','HC-144E','HC-135E','HC-126E','HC-118E','HC-110E','HC-102E','HC-094E','HC-085E','HC-077E','HC-067E','HC-059E','HC-051E','HC-033E','HC-026E','HC-016E','HC-010E')\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Outpatient Visits\n",
|
"# Outpatient Visits\n",
|
||||||
"# pufs <- c('HC-160F','HC-152F','HC-144F','HC-135F','HC-126F','HC-118F','HC-110F','HC-102F','HC-094F','HC-085F','HC-077F','HC-067F','HC-059F','HC-051F','HC-033F','HC-026F','HC-016F','HC-010F')\n",
|
"# pufs <- c('HC-160F','HC-152F','HC-144F','HC-135F','HC-126F','HC-118F','HC-110F','HC-102F','HC-094F','HC-085F','HC-077F','HC-067F','HC-059F','HC-051F','HC-033F','HC-026F','HC-016F','HC-010F')\n",
|
||||||
"\n",
|
"\n",
|
||||||
"puf_downloads = c()\n",
|
"# puf_downloads = c()\n",
|
||||||
"for (puf in pufs) {\n",
|
"# for (puf in pufs) {\n",
|
||||||
" download_puf(normalize_puf(puf))\n",
|
"# download_puf(normalize_puf(puf))\n",
|
||||||
|
"# }\n",
|
||||||
|
"\n",
|
||||||
|
"########\n",
|
||||||
|
"\n",
|
||||||
|
"# Example usage: Scrape the MEPS web page for all PUF ids\n",
|
||||||
|
"# Feel free to modify the URL being used - Any of the links on [this page](http://meps.ahrq.gov/mepsweb/data_stats/download_data_files.jsp)\n",
|
||||||
|
"# are fair game.\n",
|
||||||
|
"\n",
|
||||||
|
"#install.packages(\"rvest\")\n",
|
||||||
|
"library(rvest)\n",
|
||||||
|
"html <-\n",
|
||||||
|
" read_html(\n",
|
||||||
|
" \"http://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits\"\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"library(XML)\n",
|
||||||
|
"\n",
|
||||||
|
"# to scrap all the file name from web and form html links\n",
|
||||||
|
"doc <- htmlParse(html, asText = TRUE)\n",
|
||||||
|
"plain.text <- xpathSApply(doc, \"//a\", xmlValue)\n",
|
||||||
|
"data <- c()\n",
|
||||||
|
"for (i in 1:length(plain.text))\n",
|
||||||
|
"{\n",
|
||||||
|
" if (substr(plain.text[i], 1,3) == \"HC-\")\n",
|
||||||
|
" {\n",
|
||||||
|
" data[i] <- tolower(plain.text[i])\n",
|
||||||
|
" }\n",
|
||||||
|
" originalData <- data[!is.na(data)]\n",
|
||||||
|
" \n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"mydata <- c()\n",
|
||||||
|
"for (i in 1:length(originalData))\n",
|
||||||
|
"{\n",
|
||||||
|
" mydata[i] <- gsub(\"c-0\", \"\", originalData[i])\n",
|
||||||
|
" mydata[i] <- gsub(\"c-\", \"\", mydata[i])\n",
|
||||||
|
" \n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"# to scrap all .ssp data file \n",
|
||||||
|
"for (i in 1:length(mydata))\n",
|
||||||
|
"{\n",
|
||||||
|
" download_puf(mydata[i])\n",
|
||||||
"}"
|
"}"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user