I feel a need for (internet) speed..
by Robert Del Vicario
I've been in the market to buy a house lately, as I think it would be nice for the dog to have a back yard to run around in. One of my considerations in purchasing a home is internet speed. As a general principle I want the fastest internet possible, assuming that it is not outrageously expensive, and as luck would have it Cincinnati Bell is now offering 1Gbps internet speeds for ~$70. However, Cincinnati Bell does not offer a coverage map, which makes it a gigantic hassle to determine which houses do and do not have 1Gbps internet speeds. Additionally, the coverage seems quite poor. Given my obsessive-compulsive nature, I decided to make my own coverage map. To create it, I scraped all of the home listings in the Cincinnati area that meet my home-buying criteria (2 or more bathrooms, $100k-$300k, single-family or multi-family home) from realtor.com and then ran them through Cincinnati Bell's internet speed lookup tool. The process is annoyingly slow, and the imperfect nature of my scripting means that I drop many observations. However, I do think that the coverage map provides a general sense of 1Gbps coverage in Cincinnati.
# Realtor.com address scrape ----
# Collects listing addresses from the realtor.com search results, cleans them,
# writes them to CSV, and then sets up the Selenium driver (`remDr`) used by
# the Cincinnati Bell lookups.

library(data.table)
library(stringr)
library(XML)
library(rvest)
library(RSelenium)
library(ggmap)

# Output files below are written relative to this directory.
setwd("C:/Users/rdelvicario/Dropbox/Realtor Scraper")

# A results-page URL is trunk_url + page number + branch_url.
trunk_url <- "http://www.realtor.com/realestateandhomes-search/Cincinnati_OH/beds-2/baths-2/type-single-family-home,multi-family-home/price-100000-300000/show-hide-pending/pg-"
branch_url <- "?ml=4"

#' Scrape the listing addresses from one page of search results.
#'
#' @param i Page number, inserted between `trunk_url` and `branch_url`.
#' @param trunk_url Part of the search URL that precedes the page number.
#' @param branch_url Part of the search URL that follows the page number.
#' @return A data.table with character columns `street_number`, `street`,
#'   `city`, `state` and `zip`, one row per distinct address found. If the
#'   page cannot be fetched or parsed, a warning is raised and an empty
#'   data.table is returned so the caller can keep going.
extract_text <- function(i, trunk_url, branch_url) {
  listings_per_page <- 10L
  fields_per_listing <- 4L

  tryCatch(
    {
      # NOTE(review): `html()` is kept from the original script; newer rvest
      # releases may expect `read_html()` instead -- confirm against the
      # installed version.
      page <- html(paste0(trunk_url, i, branch_url))
      x <- page %>%
        html_nodes(".listing-location .ellipsis span") %>%
        html_text()

      # The spans are read in groups of four per listing:
      # street line, city, state, zip. Slots past the end of `x` come back
      # as NA and are dropped later by the complete.cases() filter.
      first <- seq(1L, by = fields_per_listing, length.out = listings_per_page)
      street_line <- x[first]

      # str_locate() returns a start/end matrix; take the start column only
      # so str_sub() receives plain positions. The house number is the text
      # before the first space, the street name is everything after it.
      space_at <- str_locate(street_line, " ")[, 1]

      address <- data.table(
        street_number = str_sub(street_line, start = 1, end = space_at - 1),
        street = str_sub(
          street_line,
          start = space_at + 1,
          end = str_length(street_line)
        ),
        city = x[first + 1L],
        state = x[first + 2L],
        zip = x[first + 3L]
      )
      address[!duplicated(address), ]
    },
    error = function(e) {
      warning(
        "Page ", i, " could not be scraped: ", conditionMessage(e),
        call. = FALSE
      )
      data.table()
    }
  )
}

# address collection
n_pages <- 429L
pages <- vector("list", n_pages)
for (i in seq_len(n_pages)) {
  pages[[i]] <- extract_text(i, trunk_url, branch_url)
  flush.console()
  print(i)
}
# Bind once at the end instead of growing the table on every iteration.
addresses <- rbindlist(pages, fill = TRUE)

# clear obviously bad values: anything whose house number or zip is not
# numeric becomes NA here and is removed by complete.cases()
addresses <- addresses[!duplicated(addresses), ]
addresses$street_number <- as.numeric(addresses$street_number)
addresses$zip <- as.numeric(addresses$zip)
addresses <- addresses[complete.cases(addresses), ]
write.csv(addresses, "150420 realtor addresses.csv", row.names = FALSE)

# Start Cincinnati Bell web scrape ----
# tutorial http://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html
# css selector is: div:nth-child(3) .blue
RSelenium::startServer()
# remDr <- remoteDriver(remoteServerAddr = "localhost"
#                       , port = 4444
#                       , browserName = 'firefox')
eCap <- list(phantomjs.binary.path = "C:/Users/rdelvicario/Desktop/phantomjs.exe")
remDr <- remoteDriver(browserName = "phantomjs", extraCapabilities = eCap)
#' Look up the internet speed Cincinnati Bell reports for one address.
#'
#' Drives the global Selenium driver `remDr`: opens a session, navigates to
#' the availability form, submits the address and reads the result element.
#'
#' @param addresses Table with `street_number`, `street` and `zip` columns.
#' @param i Row of `addresses` to look up.
#' @param wait_time Seconds to wait after submitting the address before
#'   reading the result.
#' @return The text of the result element as a character string, or
#'   `NA_character_` if any step of the lookup fails.
extractSpeed <- function(addresses, i, wait_time = 5) {
  # Always close the session, on success or failure. The close is wrapped in
  # try() so that a failing close cannot abort the caller's loop.
  on.exit(try(remDr$close(), silent = TRUE), add = TRUE)

  tryCatch(
    {
      # create new instance of web browser
      remDr$open(silent = TRUE)

      # navigate to cincinnati bell website
      remDr$navigate("http://www.cincinnatibell.com/internet/")

      # remove landing page
      webElem <- remDr$findElement(using = "id", "x-region")
      webElem$clickElement()

      # find and click the check availability link
      webElem <- remDr$findElement(using = "css selector", "p > a")
      # webElem$highlightElement() # to visually check what element is selected
      webElem$clickElement()

      # sleep for a few seconds
      Sys.sleep(5)

      # input address data
      webElem <- remDr$findElement(using = "id", "street_num")
      webElem$sendKeysToElement(list(as.character(addresses$street_number[i])))

      webElem <- remDr$findElement(using = "id", "street_name")
      webElem$sendKeysToElement(list(addresses$street[i]))

      webElem <- remDr$findElement(using = "id", "zip")
      webElem$sendKeysToElement(list(as.character(addresses$zip[i])))

      # click check address button
      webElem <- remDr$findElement(
        using = "css selector",
        "img.b_check_address.float_right"
      )
      webElem$clickElement()

      # sleep for a few seconds
      Sys.sleep(wait_time)

      # extract speed
      webElem <- remDr$findElement(
        using = "css selector",
        "div:nth-child(3) .blue"
      )
      webElem$getElementText()[[1]]
    },
    # Best-effort lookup: any failure is recorded as a missing speed.
    error = function(e) NA_character_
  )
}

# test
extractSpeed(addresses, 3, wait_time = 20)

# loop through all the addresses to determine internet speed
addresses$speed <- NA_character_
for (i in seq_len(nrow(addresses))) {
  addresses$speed[i] <- extractSpeed(addresses, i, 10)
  flush.console()
  print(addresses$speed[i])
  flush.console()
  print(i)
}
# write.csv(addresses, "data locations.csv", row.names = F)

# create addresses
addresses$address <- paste0(
  addresses$street_number, " ", addresses$street, ", ",
  addresses$city, ", ", addresses$state, " ", addresses$zip
)

# remove locations with no data
addresses_2 <- addresses[!is.na(addresses$speed), ]

# get latitude and longitude from google; geocode() is called once on the
# whole address vector and its `lon`/`lat` columns are attached directly
latlon <- geocode(addresses_2$address, output = "latlon")

# attach coordinates to the addresses
addresses_2 <- cbind(addresses_2, latlon)

# write out data
write.csv(
  addresses_2,
  "data locations with lattitude and longitude.csv",
  row.names = FALSE
)
Created by Pretty R at inside-R.org















