mirror of
				https://github.com/bspeice/betterwithdata_cleaning_4
				synced 2025-11-04 02:10:46 -05:00 
			
		
		
		
	MEPS scraping example
Add a full example of how to scrape the MEPS site
This commit is contained in:
		@ -118,23 +118,142 @@
 | 
				
			|||||||
   "outputs": [
 | 
					   "outputs": [
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
     "ename": "ERROR",
 | 
					     "ename": "ERROR",
 | 
				
			||||||
     "evalue": "Error in eval(expr, envir, enclos): object 'pufs' not found\n",
 | 
					     "evalue": "Error in eval(expr, envir, enclos): object 'w' not found\n",
 | 
				
			||||||
     "output_type": "error",
 | 
					     "output_type": "error",
 | 
				
			||||||
     "traceback": [
 | 
					     "traceback": [
 | 
				
			||||||
      "Error in eval(expr, envir, enclos): object 'pufs' not found\n"
 | 
					      "Error in eval(expr, envir, enclos): object 'w' not found\n"
 | 
				
			||||||
 | 
					     ]
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					     "name": "stderr",
 | 
				
			||||||
 | 
					     "output_type": "stream",
 | 
				
			||||||
 | 
					     "text": [
 | 
				
			||||||
 | 
					      "Loading required package: xml2\n",
 | 
				
			||||||
 | 
					      "\n",
 | 
				
			||||||
 | 
					      "Attaching package: 'XML'\n",
 | 
				
			||||||
 | 
					      "\n",
 | 
				
			||||||
 | 
					      "The following object is masked from 'package:rvest':\n",
 | 
				
			||||||
 | 
					      "\n",
 | 
				
			||||||
 | 
					      "    xml\n",
 | 
				
			||||||
 | 
					      "\n",
 | 
				
			||||||
 | 
					      "Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'Warning message:\n",
 | 
				
			||||||
 | 
					      "In is.na(data): is.na() applied to non-(list or vector) of type 'NULL'"
 | 
				
			||||||
 | 
					     ]
 | 
				
			||||||
 | 
					    },
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					     "ename": "ERROR",
 | 
				
			||||||
 | 
					     "evalue": "Error in lookup.xport(file): file not in SAS transfer format\n",
 | 
				
			||||||
 | 
					     "output_type": "error",
 | 
				
			||||||
 | 
					     "traceback": [
 | 
				
			||||||
 | 
					      "Error in lookup.xport(file): file not in SAS transfer format\n"
 | 
				
			||||||
     ]
 | 
					     ]
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
   ],
 | 
					   ],
 | 
				
			||||||
   "source": [
 | 
					   "source": [
 | 
				
			||||||
 | 
					    "# Example usage: Download data by PUF\n",
 | 
				
			||||||
 | 
					    "#\n",
 | 
				
			||||||
    "# Emergency Visit PUF's\n",
 | 
					    "# Emergency Visit PUF's\n",
 | 
				
			||||||
    "# pufs <- c('HC-160E','HC-152E','HC-144E','HC-135E','HC-126E','HC-118E','HC-110E','HC-102E','HC-094E','HC-085E','HC-077E','HC-067E','HC-059E','HC-051E','HC-033E','HC-026E','HC-016E','HC-010E')\n",
 | 
					    "# pufs <- c('HC-160E','HC-152E','HC-144E','HC-135E','HC-126E','HC-118E','HC-110E','HC-102E','HC-094E','HC-085E','HC-077E','HC-067E','HC-059E','HC-051E','HC-033E','HC-026E','HC-016E','HC-010E')\n",
 | 
				
			||||||
    "\n",
 | 
					    "\n",
 | 
				
			||||||
    "# Outpatient Visits\n",
 | 
					    "# Outpatient Visits\n",
 | 
				
			||||||
    "# pufs <- c('HC-160F','HC-152F','HC-144F','HC-135F','HC-126F','HC-118F','HC-110F','HC-102F','HC-094F','HC-085F','HC-077F','HC-067F','HC-059F','HC-051F','HC-033F','HC-026F','HC-016F','HC-010F')\n",
 | 
					    "# pufs <- c('HC-160F','HC-152F','HC-144F','HC-135F','HC-126F','HC-118F','HC-110F','HC-102F','HC-094F','HC-085F','HC-077F','HC-067F','HC-059F','HC-051F','HC-033F','HC-026F','HC-016F','HC-010F')\n",
 | 
				
			||||||
    "\n",
 | 
					    "\n",
 | 
				
			||||||
    "puf_downloads = c()\n",
 | 
					    "# puf_downloads = c()\n",
 | 
				
			||||||
    "for (puf in pufs) {\n",
 | 
					    "# for (puf in pufs) {\n",
 | 
				
			||||||
    "    download_puf(normalize_puf(puf))\n",
 | 
					    "#    download_puf(normalize_puf(puf))\n",
 | 
				
			||||||
 | 
					    "# }\n",
 | 
				
			||||||
 | 
					    "\n",
 | 
				
			||||||
 | 
					    "########\n",
 | 
				
			||||||
 | 
					    "\n",
 | 
				
			||||||
 | 
					    "# Example usage: Scrape the MEPS web page for all PUF ids\n",
 | 
				
			||||||
 | 
					    "# Feel free to modify the URL being used - Any of the links on [this page](http://meps.ahrq.gov/mepsweb/data_stats/download_data_files.jsp)\n",
 | 
				
			||||||
 | 
					    "# are fair game.\n",
 | 
				
			||||||
 | 
					    "\n",
 | 
				
			||||||
 | 
					    "#install.packages(\"rvest\")\n",
 | 
				
			||||||
 | 
					    "library(rvest)\n",
 | 
				
			||||||
 | 
					    "html <-\n",
 | 
				
			||||||
 | 
					    "  read_html(\n",
 | 
				
			||||||
 | 
					    "    \"http://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits\"\n",
 | 
				
			||||||
 | 
					    "  )\n",
 | 
				
			||||||
 | 
					    "\n",
 | 
				
			||||||
 | 
					    "library(XML)\n",
 | 
				
			||||||
 | 
					    "\n",
 | 
				
			||||||
 | 
					    "# to scrap all the file name from web and form html links\n",
 | 
				
			||||||
 | 
					    "doc <-  htmlParse(html, asText = TRUE)\n",
 | 
				
			||||||
 | 
					    "plain.text <- xpathSApply(doc, \"//a\", xmlValue)\n",
 | 
				
			||||||
 | 
					    "data <- c()\n",
 | 
				
			||||||
 | 
					    "for (i in 1:length(plain.text))\n",
 | 
				
			||||||
 | 
					    "{\n",
 | 
				
			||||||
 | 
					    "  if (substr(plain.text[i], 1,3) == \"HC-\")\n",
 | 
				
			||||||
 | 
					    "  {\n",
 | 
				
			||||||
 | 
					    "    data[i] <- tolower(plain.text[i])\n",
 | 
				
			||||||
 | 
					    "  }\n",
 | 
				
			||||||
 | 
					    "  originalData <- data[!is.na(data)]\n",
 | 
				
			||||||
 | 
					    "  \n",
 | 
				
			||||||
 | 
					    "}\n",
 | 
				
			||||||
 | 
					    "\n",
 | 
				
			||||||
 | 
					    "mydata <- c()\n",
 | 
				
			||||||
 | 
					    "for (i in 1:length(originalData))\n",
 | 
				
			||||||
 | 
					    "{\n",
 | 
				
			||||||
 | 
					    "  mydata[i] <- gsub(\"c-0\", \"\", originalData[i])\n",
 | 
				
			||||||
 | 
					    "  mydata[i] <- gsub(\"c-\", \"\", mydata[i])\n",
 | 
				
			||||||
 | 
					    "  \n",
 | 
				
			||||||
 | 
					    "}\n",
 | 
				
			||||||
 | 
					    "\n",
 | 
				
			||||||
 | 
					    "# to scrap all .ssp data file \n",
 | 
				
			||||||
 | 
					    "for (i in 1:length(mydata))\n",
 | 
				
			||||||
 | 
					    "{\n",
 | 
				
			||||||
 | 
					    "  download_puf(mydata[i])\n",
 | 
				
			||||||
    "}"
 | 
					    "}"
 | 
				
			||||||
   ]
 | 
					   ]
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user