diff --git a/exploration/MEPS Scraping.ipynb b/exploration/MEPS Scraping.ipynb index b26c075..c224ab5 100644 --- a/exploration/MEPS Scraping.ipynb +++ b/exploration/MEPS Scraping.ipynb @@ -118,23 +118,142 @@ "outputs": [ { "ename": "ERROR", - "evalue": "Error in eval(expr, envir, enclos): object 'pufs' not found\n", + "evalue": "Error in eval(expr, envir, enclos): object 'w' not found\n", "output_type": "error", "traceback": [ - "Error in eval(expr, envir, enclos): object 'pufs' not found\n" + "Error in eval(expr, envir, enclos): object 'w' not found\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading required package: xml2\n", + "\n", + "Attaching package: 'XML'\n", + "\n", + "The following object is masked from 'package:rvest':\n", + "\n", + " xml\n", + "\n", + "Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to 
non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of 
type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n", + "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'" + ] + }, + { + "ename": "ERROR", + "evalue": "Error in lookup.xport(file): file not in SAS transfer format\n", + "output_type": "error", + "traceback": [ + "Error in lookup.xport(file): file not in SAS transfer format\n" ] } ], "source": [ + "# Example usage: Download data by PUF\n", + "#\n", "# Emergency Visit PUF's\n", "# pufs <- 
c('HC-160E','HC-152E','HC-144E','HC-135E','HC-126E','HC-118E','HC-110E','HC-102E','HC-094E','HC-085E','HC-077E','HC-067E','HC-059E','HC-051E','HC-033E','HC-026E','HC-016E','HC-010E')\n", "\n", "# Outpatient Visits\n", "# pufs <- c('HC-160F','HC-152F','HC-144F','HC-135F','HC-126F','HC-118F','HC-110F','HC-102F','HC-094F','HC-085F','HC-077F','HC-067F','HC-059F','HC-051F','HC-033F','HC-026F','HC-016F','HC-010F')\n", "\n", - "puf_downloads = c()\n", - "for (puf in pufs) {\n", - " download_puf(normalize_puf(puf))\n", + "# puf_downloads = c()\n", + "# for (puf in pufs) {\n", + "# download_puf(normalize_puf(puf))\n", + "# }\n", + "\n", + "########\n", + "\n", + "# Example usage: Scrape the MEPS web page for all PUF ids\n", + "# Feel free to modify the URL being used - Any of the links on [this page](http://meps.ahrq.gov/mepsweb/data_stats/download_data_files.jsp)\n", + "# are fair game.\n", + "\n", + "#install.packages(\"rvest\")\n", + "library(rvest)\n", + "html <-\n", + " read_html(\n", + " \"http://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits\"\n", + " )\n", + "\n", + "library(XML)\n", + "\n", + "# Scrape the text of every link on the results page.\n", + "doc <- htmlParse(html, asText = TRUE)\n", + "plain.text <- as.character(unlist(xpathSApply(doc, \"//a\", xmlValue)))\n", + "\n", + "# Keep only the links whose text is a PUF id (\"HC-...\"), lower-cased and\n", + "# de-duplicated. Filtering the whole vector at once avoids the per-link\n", + "# is.na(NULL) warnings and still works when no link matches.\n", + "is_puf <- substr(plain.text, 1, 3) %in% \"HC-\"\n", + "originalData <- unique(tolower(plain.text[is_puf]))\n", + "\n", + "# Strip \"c-\" (and a zero right after it) to get the file-name form:\n", + "# \"hc-094e\" -> \"h94e\", \"hc-160e\" -> \"h160e\".\n", + "# NOTE(review): normalize_puf() in the example above presumably does the\n", + "# same conversion - confirm and reuse it here.\n", + "mydata <- gsub(\"c-\", \"\", gsub(\"c-0\", \"\", originalData))\n", + "\n", + "# Download every scraped PUF. A failure on one file is reported as a\n", + "# warning so that it does not abort the remaining downloads.\n", + "for (puf_id in mydata) {\n", + "  tryCatch(\n", + "    download_puf(puf_id),\n", + "    error = function(e) {\n", + "      warning(\"Failed to download \", puf_id, \": \",\n", + "        conditionMessage(e), call. = FALSE)\n", + "    }\n", + "  )\n", "}"
] }