In this coding example, I downloaded public housing auction data from the Taiwanese Department of the Interior to build a predictive model of auction prices. The data also incoporated using web-scraping and google API’s to calculate the distance to the nearest subway station in Kaohsiung city.
knitr::opts_chunk$set(eval=FALSE)
library(tidyverse)
Using Downloaded data from https://pip.moi.gov.tw/V2/A/SCRA0601.aspx
library(readxl)
auction_download<- read_excel("~/Desktop/auction.xls")
Clean housing data set
temp<-auction_download
#Creating "shares" of the building in auction and then categorize for structures of the apartment building or a stand-alone house
temp<-temp%>%
separate(權利範圍,c("分母","分子"),"分之")%>%
mutate("權利範圍"=ifelse(is.na(分子),1,as.numeric(分子)/as.numeric(分母)))%>%
select(-c(分母,分子))%>%
mutate(建物面積=as.numeric(建物面積), 總拍定價格=as.numeric(總拍定價格), 拍別=as.numeric(拍別))%>%
separate(層數及構造,c("層數","構造"),"第")%>%
mutate(層數=str_extract(層數,"\\d+"),構造=str_extract(構造,"\\d+"))%>%
mutate(層數=as.numeric(層數),構造=as.numeric(構造))%>%
mutate(層數=ifelse(層數>100&層數<200,as.numeric(str_replace(as.character(層數),"10","1")),ifelse(層數%in%c(210,310,410,510,610,710,810,910),round(層數/100)*10,ifelse(層數>2000,as.numeric(str_replace(as.character(層數),"10","")),層數))))%>%
mutate(構造=ifelse(構造>100&構造<200,as.numeric(str_replace(as.character(構造),"10","1")),ifelse(構造%in%c(210,310,410,510,610,710,810,910),round(構造/100)*10,ifelse(構造>2000,as.numeric(str_replace(as.character(構造),"10","")),構造))))%>%
mutate(鄉鎮市區=as.factor(鄉鎮市區))%>%
#Creating year, month, date data from the original data set.
mutate(年=round(拍定日期/10000),月=round((拍定日期-年*10000)/100),日=拍定日期-年*10000-月*100)
#Using regular expression to match single address
temp<-temp%>%
mutate(構造=ifelse(is.na(構造),0,構造),層數=ifelse(is.na(層數),1,層數),透天=ifelse(構造==0,ifelse(層數<7,1,0),0))%>%
mutate(address=str_extract_all(房屋地址,pattern=regex("高雄市.+(路|街|巷)\\d+號")))%>%
unnest(address)
Prepare for translating addresses to longitude and latitude to calculate distance from subway.
library(ggmap)
#I took out the API_key for the purpose of this coding example
api_key <-"API_Key"
register_google(key = api_key)
#geoList is needed to capture erroneous addresses
geoLoc<-function(address){
tryCatch(
geo_df<-geocode(address)
)
if(is.na(geo_df$lat)){
lat<-999
lng<-999
}else{
lat<-geo_df$lat
lng<-geo_df$lon
}
return(list(lat=lat,lng=lng,address=address))
#Added for the query restrictions of google API
# Sys.sleep(0.01)
}
Adding subway data from https://www.krtc.com.tw/Guide/station_guide
library(rvest)
met_link<-read_html("https://www.krtc.com.tw/Guide/station_guide")
stat_link<-met_link%>%html_nodes('.stationList ul li a')%>%html_attr("href")%>%data.frame()%>%as_tibble()%>%dplyr::rename(link=".")%>%mutate(link=paste("https://www.krtc.com.tw",link,sep=""))
get_station_address<-function(link){
station_link<-read_html(as.character(link))
address<-station_link%>%html_nodes(xpath='//*[@id="articleBox"]/div[2]/div[3]/ul/li[2]/text()')%>%html_text()%>%str_extract_all(regex("高雄.+號"))
return(address)
}
station_geo<-stat_link%>%
rowwise()%>%
mutate(address=get_station_address(link))%>%
mutate(address=unlist(address))%>%
mutate_geocode(address)%>%
mutate(num=1)
Get geolocations of houses
house_geo<-map_dfr(unique(temp$address)[1:10],geoLoc)
house_geo_backup<-house-geo
library(geosphere)
houses<-inner_join(temp,house_geo,by="address")%>%
dplyr::rename(lat_h=lat,lng_h=lng)%>%
group_by(address)%>%
dplyr::mutate(house_num=1:n(),num=1)%>%
ungroup()%>%
#matching each address to all stations
full_join(station_geo,by="num")%>%
select(-num)%>%
rowwise()%>%
dplyr::mutate(dist_from_station=distGeo(c(lng_h,lat_h),c(lng_s,lat_s))/1000)%>%
group_by(house_num)%>%
filter(dist_from_station==min(dist_from_station))%>%
mutate(notNearStation=ifelse(dist_from_station>=0.5,1,0))
Simple ‘Linear’ Models
model<-lm(log(總拍定價格)~log(建物面積+1)+log(層數+1)+log(層數+1):透天+透天+構造+dist_from_station+notNearStation+dist_from_station:notNearStation+as.factor(拍別)+(-log(權利範圍))+as.factor(點交)+as.factor(月)+as.factor(年)+as.factor(鄉鎮市區),data=filter(houses,建物面積<500))
summary(model)
house<-data.frame(鄉鎮市區=c("三民"),建物面積=c(60),層數=c(25),構造=c(21),拍別=c(2),點交=c("點交"),月=c(11),權利範圍=1,透天=0,年=108,dist_from_station=0.6,notNearStation=1)
exp(predict(model,house))