In this coding example, I downloaded public housing auction data from the Taiwanese Department of the Interior to build a predictive model of auction prices. The data also incoporated using web-scraping and google API’s to calculate the distance to the nearest subway station in Kaohsiung city.

knitr::opts_chunk$set(eval=FALSE)
library(tidyverse)

Using Downloaded data from https://pip.moi.gov.tw/V2/A/SCRA0601.aspx

library(readxl)
auction_download<- read_excel("~/Desktop/auction.xls")

Clean housing data set

temp<-auction_download
#Creating "shares" of the building in auction and then categorize for structures of the apartment building or a stand-alone house
temp<-temp%>%
  separate(權利範圍,c("分母","分子"),"分之")%>%
  mutate("權利範圍"=ifelse(is.na(分子),1,as.numeric(分子)/as.numeric(分母)))%>%
  select(-c(分母,分子))%>%
  mutate(建物面積=as.numeric(建物面積), 總拍定價格=as.numeric(總拍定價格), 拍別=as.numeric(拍別))%>%
  separate(層數及構造,c("層數","構造"),"第")%>%
  mutate(層數=str_extract(層數,"\\d+"),構造=str_extract(構造,"\\d+"))%>%
  mutate(層數=as.numeric(層數),構造=as.numeric(構造))%>%
  mutate(層數=ifelse(層數>100&層數<200,as.numeric(str_replace(as.character(層數),"10","1")),ifelse(層數%in%c(210,310,410,510,610,710,810,910),round(層數/100)*10,ifelse(層數>2000,as.numeric(str_replace(as.character(層數),"10","")),層數))))%>%
  mutate(構造=ifelse(構造>100&構造<200,as.numeric(str_replace(as.character(構造),"10","1")),ifelse(構造%in%c(210,310,410,510,610,710,810,910),round(構造/100)*10,ifelse(構造>2000,as.numeric(str_replace(as.character(構造),"10","")),構造))))%>%
  mutate(鄉鎮市區=as.factor(鄉鎮市區))%>%
  #Creating year, month, date data from the original data set.
  mutate(年=round(拍定日期/10000),月=round((拍定日期-年*10000)/100),日=拍定日期-年*10000-月*100)

#Using regular expression to match single address
temp<-temp%>%
  mutate(構造=ifelse(is.na(構造),0,構造),層數=ifelse(is.na(層數),1,層數),透天=ifelse(構造==0,ifelse(層數<7,1,0),0))%>%
  mutate(address=str_extract_all(房屋地址,pattern=regex("高雄市.+(路|街|巷)\\d+號")))%>%
  unnest(address)

Prepare for translating addresses to longitude and latitude to calculate distance from subway.

library(ggmap)
#I took out the API_key for the purpose of this coding example
api_key <-"API_Key"
register_google(key = api_key)
#geoList is needed to capture erroneous addresses
geoLoc<-function(address){
  tryCatch(
  geo_df<-geocode(address)
  )
  if(is.na(geo_df$lat)){
  lat<-999
  lng<-999
  }else{
  lat<-geo_df$lat
  lng<-geo_df$lon
  }
  return(list(lat=lat,lng=lng,address=address)) 
  #Added for the query restrictions of google API
  # Sys.sleep(0.01)
}

Adding subway data from https://www.krtc.com.tw/Guide/station_guide

library(rvest)
met_link<-read_html("https://www.krtc.com.tw/Guide/station_guide")
stat_link<-met_link%>%html_nodes('.stationList ul li a')%>%html_attr("href")%>%data.frame()%>%as_tibble()%>%dplyr::rename(link=".")%>%mutate(link=paste("https://www.krtc.com.tw",link,sep=""))

get_station_address<-function(link){
station_link<-read_html(as.character(link))
address<-station_link%>%html_nodes(xpath='//*[@id="articleBox"]/div[2]/div[3]/ul/li[2]/text()')%>%html_text()%>%str_extract_all(regex("高雄.+號"))
return(address)
}

station_geo<-stat_link%>%
  rowwise()%>%
  mutate(address=get_station_address(link))%>%
  mutate(address=unlist(address))%>%
  mutate_geocode(address)%>%
  mutate(num=1)

Get geolocations of houses

house_geo<-map_dfr(unique(temp$address)[1:10],geoLoc)
house_geo_backup<-house-geo
library(geosphere)
houses<-inner_join(temp,house_geo,by="address")%>%
  dplyr::rename(lat_h=lat,lng_h=lng)%>%
  group_by(address)%>%
  dplyr::mutate(house_num=1:n(),num=1)%>%
  ungroup()%>%
  #matching each address to all stations
  full_join(station_geo,by="num")%>%
  select(-num)%>%
  rowwise()%>%
  dplyr::mutate(dist_from_station=distGeo(c(lng_h,lat_h),c(lng_s,lat_s))/1000)%>%
  group_by(house_num)%>%
  filter(dist_from_station==min(dist_from_station))%>%
  mutate(notNearStation=ifelse(dist_from_station>=0.5,1,0))

Simple ‘Linear’ Models

model<-lm(log(總拍定價格)~log(建物面積+1)+log(層數+1)+log(層數+1):透天+透天+構造+dist_from_station+notNearStation+dist_from_station:notNearStation+as.factor(拍別)+(-log(權利範圍))+as.factor(點交)+as.factor(月)+as.factor(年)+as.factor(鄉鎮市區),data=filter(houses,建物面積<500))

summary(model)

house<-data.frame(鄉鎮市區=c("三民"),建物面積=c(60),層數=c(25),構造=c(21),拍別=c(2),點交=c("點交"),月=c(11),權利範圍=1,透天=0,年=108,dist_from_station=0.6,notNearStation=1)

exp(predict(model,house))