Return to MUSA 801 Projects Page

The project was created in association with the MUSA 801 Practicum at the University of Pennsylvania, taught by Ken Steif, Karl Dailey, and Michael Fichman. We would like to thank Matthew Harris for providing guidance and support to help us develop the model. All products from this class should be considered proofs of concept and works in progress.

This document starts with the Introduction section, then dives into data exploration and feature engineering. After preparing the concepts and the data, we move into the model building and model validation sections.

9. Appendix

9.1 Load libraries

# ---- Load requisite libraries ----
# NOTE: load order is preserved deliberately (e.g. plyr after tidyverse),
# since later loads mask earlier functions.

# Data wrangling
library(tidyverse)
library(dbplyr)
library(plyr)
library(mefa4)
library(stringr)
library(lubridate)
library(sf)
library(geosphere)
library(tmap)

# Plotting
library(ggplot2)
library(viridisLite)
library(gganimate)

# API access
library(tidycensus)    # census
library(osmdata)       # OpenStreetMap
library(rwunderground) # Dark Sky
library(darksky)       # Dark Sky
library(curl)          # Dark Sky
library(gtools)        # Dark Sky

# Modeling
library(Boruta)
library(caret)
library(mlbench)
library(prophet)
library(randomForest)
library(lme4)

9.2 Create hexagonal grid cells for main roads and highways

#CREATE MAP OF HEXAGONAL GRID CELLS (FISHNET)

#GET CENSUS DATA
#cache census shapefile downloads
options(tigris_use_cache = TRUE)
#insert your API key here
census_api_key("")
#get jefferson county shapefile from census data (the ACS variable is only a
#vehicle for pulling the county geometry)
jeffersonCounty <- get_acs(state = "KY", geography = "county",
                           variables = "B19013_001", geometry = TRUE) %>%
  filter(NAME == 'Jefferson County, Kentucky')
#set the coordinate reference system for the county shapefile - UTM 16N NAD83 meters
jeffersonCounty <- jeffersonCounty %>% st_transform(., crs = 32616)

#GET OSM BOUNDING BOX
#set bounding box for louisville - refer osm website for coordinates
#(https://www.openstreetmap.org/export#map=10/38.1889/-85.6761)
osm_cityBB <- opq(bbox = c(-86.4899, 37.8564, -84.8611, 38.5192))

#GET OSM STREET DATA
#get OSM road data - select main roads and highways
osm_roads <- add_osm_feature(opq = osm_cityBB, key = "highway",
                             value = c("motorway", "motorway_link", "primary",
                                       "primary_link", "tertiary", "trunk",
                                       "tertiary_link", "trunk_link",
                                       "secondary", "secondary_link"))
#convert osm street dataframe to a sf object
osm_roads_sf <- osmdata_sf(osm_roads)
#get geometry for osm roads sf (line geometry)
osm_roads_lines <- st_geometry(osm_roads_sf$osm_lines)
#set the coordinate reference system - UTM 16N NAD83 meters
#NOTE(review): the st_as_sf(coords = ...) step is redundant here -- the object
#is already an sf line layer, so the coords argument has no effect; confirm
#the layer carries EPSG:4326 before the transform.
osm_roads_lines <- osm_roads_lines %>% st_sf() %>%
  st_as_sf(., coords = c("longitude", "latitude"), crs = 4326) %>%
  st_transform(., crs = 32616) %>%
  st_sf()
#clip osm roads to jefferson county
OSM_road_clipped <- osm_roads_lines[jeffersonCounty, ]
#remove extraneous dataframes to free up memory space
rm(osm_roads)
rm(osm_roads_sf)
rm(osm_roads_lines)

#CREATE FISHNET GRID CELLS
#set fishnet grid cell size (in meters)
fishnet_grid_dim <- 500
#create fishnet - hexagon shaped grid cells over a 100 m buffer of the clipped
#roads, keeping only cells that intersect at least one road segment
fishnet <- st_make_grid(st_as_sfc(st_bbox(st_buffer(OSM_road_clipped, 100))),
                        cellsize = fishnet_grid_dim, square = FALSE) %>%
  st_sf() %>%
  mutate(fishnet_id = row_number(),
         f_area_m2 = as.numeric(st_area(.))) %>%
  filter(lengths(st_intersects(., OSM_road_clipped)) > 0)

9.3 Spatial join grid cells with traffic jams data sampled from Waze database

#SPATIAL JOIN JAMS DATA WITH FISHNET/GRID CELLS
#load jams data sampled from Waze database
jams <- read.csv("jams_bigQ.csv")
#convert jams dataframe to a sf object - UTM 16N NAD83 meters
#NOTE(review): st_as_sf(coords = ...) consumes the latitude/longitude columns
#by default; later sections (9.5) select `latitude`/`longitude` from
#jamsFishnet -- confirm those columns survive, or pass remove = FALSE here.
jams_sf <- jams %>%
  st_as_sf(., coords = c("longitude", "latitude"), crs = 4326) %>%
  st_transform(., crs = 32616) %>%
  st_sf()
#spatial join fishnet and jams data (one row per jam segment per cell)
jamsFishnet <- st_intersection(fishnet, jams_sf)

9.4 Extract date/time for the jams data

#EXTRACT DATE/TIME INFORMATION
#NOTE: PLEASE ENSURE THAT THE JAMS DATA SAMPLED FROM WAZE DATABASE IS AT LOCAL
#TIME AND ADJUSTED FOR TIME DIFFERENCE BETWEEN 'UTC' AND LOCAL TIME
#create date time object and add columns for date, month, day, weekday
#FIX: the original referenced `jam$pub_utc_date`, but no object named `jam`
#exists -- inside mutate() the column is available directly as `pub_utc_date`.
#FIX: "America/Louisville" is not a valid Olson time-zone name; the correct
#identifier for Louisville, KY is "America/Kentucky/Louisville".
jamsFishnet <- jamsFishnet %>%
  mutate(dateTimeObj = ymd_hms(pub_utc_date, tz = "America/Kentucky/Louisville"),
         date = format.Date(date(dateTimeObj)),  #date (character)
         month = month(dateTimeObj),             #month (numeric)
         day = day(dateTimeObj),                 #day (numeric)
         weekday = weekdays(dateTimeObj),        #weekday (character)
         hour = hour(dateTimeObj))               #hour (numeric)

9.5 Jam directions

#JAMS DIRECTION
#load jams data sampled from Waze database
#NOTE(review): `jams` was already read in section 9.3 -- this re-read is
#redundant but harmless.
jams <- read.csv("jams_bigQ.csv")

#one row per jam, retaining the min/max segment order for that jam
jamsFishnet_dir <- jamsFishnet %>%
  dplyr::select(jam_id, uuid, pub_utc_date, speed, delay, length, road_type,
                level, blocking_alert_id, order) %>%
  group_by(jam_id) %>%
  mutate(min_order = min(order), max_order = max(order)) %>%
  dplyr::select(-order) %>%
  distinct() %>%
  ungroup()

#attach start coordinates (segment with the smallest order -- the minimum
#order is not always 1) and end coordinates (largest order)
#NOTE(review): assumes `latitude`/`longitude`/`order` columns are present on
#jamsFishnet; confirm st_as_sf() upstream kept the coordinate columns.
jamsFishnet_dir2 <- jamsFishnet_dir %>%
  inner_join(.,
             dplyr::select(jamsFishnet, jam_id, "start_lat" = latitude,
                           "start_long" = longitude, order) %>%
               group_by(jam_id) %>%
               filter(order == min(order)),
             by = "jam_id") %>%  # min order not always 1
  dplyr::select(-order, -pub_utc_date, -uuid, -speed, -delay, -length,
                -road_type, -level, -blocking_alert_id) %>%  # clean-up
  #FIX: end coordinates must come from jamsFishnet -- jamsFishnet_dir has no
  #latitude/longitude/order columns (dropped above), so joining against it
  #could not have worked.
  inner_join(.,
             dplyr::select(jamsFishnet, jam_id, "end_lat" = latitude,
                           "end_long" = longitude, order) %>%
               group_by(jam_id) %>%
               filter(order == max(order)),
             by = "jam_id") %>%
  ungroup()

#build a start->end line per jam and compute its compass bearing
jamsFishnet_dir3 <- jamsFishnet_dir2 %>%
  group_by(jam_id) %>%
  mutate(
    geom = st_as_text(st_linestring(matrix(c(start_long, start_lat,
                                             end_long, end_lat),
                                           nrow = 2, ncol = 2, byrow = TRUE))),
    line_dir = geosphere::bearing(c(start_long, start_lat),
                                  c(end_long, end_lat)),
    #bearing() returns values in (-180, 180]; map negatives onto [180, 360)
    line_dir = if_else(line_dir < 0, 360 - abs(line_dir), line_dir)
  ) %>%
  st_as_sf(wkt = "geom", crs = 4326) %>%
  st_transform(., crs = 32616)  # UTM 16N NAD83 meters

#bucket the bearing into the four cardinal bound directions
jamsFishnet_dir4 <- jamsFishnet_dir3 %>%
  mutate(
    ew_direction = case_when(end_long < start_long ~ "w",
                             end_long > start_long ~ "e"),
    #FIX: the north case originally compared longitudes (end_long >
    #start_long); north/south is a latitude comparison.
    ns_direction = case_when(end_lat < start_lat ~ "s",
                             end_lat > start_lat ~ "n"),
    line_dir = case_when(line_dir > 0   & line_dir <= 50  ~ "north bound",
                         line_dir > 50  & line_dir <= 140 ~ "east bound",
                         line_dir > 140 & line_dir <= 220 ~ "south bound",
                         line_dir > 220 & line_dir <= 310 ~ "west bound",
                         line_dir > 310 & line_dir <= 360 ~ "north bound")
  ) %>%
  na.omit()

#join the per-jam direction back onto the per-segment jam records
jamsFishnet_dir4_df <- jamsFishnet_dir4 %>% st_set_geometry(NULL)
class(jamsFishnet_dir4_df)
Jam_Q_dir <- jamsFishnet %>%
  dplyr::select(-order) %>%
  left_join(., jamsFishnet_dir4_df, by = "jam_id")

#rebuild one linestring per jam from its ordered segment points
jamsFishnetDir <- Jam_Q_dir %>%
  dplyr::select(jam_id, pub_utc_date, speed, delay, name, length, level,
                latitude, longitude, line_dir) %>%
  group_by(jam_id, pub_utc_date, speed, delay, name, length, level,
           line_dir) %>%  # group must include all non-coordinate columns
  tidyr::nest() %>%
  mutate(data = map(data, as.matrix),
         #reverse column order to (long, lat) and keep drop = FALSE so
         #single-point jams still yield a valid coordinate matrix
         geom = map(data, function(.x) st_linestring(.x[, 2:1, drop = FALSE])),
         geom = map_chr(geom, st_as_text)) %>%
  dplyr::select(-data) %>%
  st_as_sf(wkt = "geom", crs = 4326) %>%
  st_transform(., crs = 32616) %>%  # UTM 16N NAD83 meters
  na.omit()

9.6 Jams data imputation

#IMPUTE JAMS DATA
#NOTE(review): this step expects `fishnet_id`, `Direction` and `road_type`
#columns on jamsFishnetDir; section 9.5 creates the direction column as
#`line_dir`, so confirm/rename upstream before running.
df1 <- jamsFishnetDir %>%
  dplyr::select(fishnet_id, pub_utc_date, length, speed, delay, level,
                Direction, road_type) %>%
  mutate(
    pub_utc_date = ymd_hms(as.character(pub_utc_date)),
    local_time = with_tz(pub_utc_date, "UTC"),
    #default missing directions to east bound, then one-hot encode
    Direction = as.character(ifelse(is.na(Direction), "east bound",
                                    as.character(Direction))),
    D_north = if_else(Direction == 'north bound', 1, 0),
    D_south = if_else(Direction == 'south bound', 1, 0),
    D_east = if_else(Direction == 'east bound', 1, 0),
    D_west = if_else(Direction == "west bound", 1, 0),
    freeway = as.factor(if_else(road_type == 3, 1, 0))
  ) %>%
  mutate(year = year(local_time), month = month(local_time),
         day = day(local_time), weekday = weekdays(local_time),
         hour = hour(local_time), yday = yday(local_time),
         yhour = ((yday - 1) * 23) + hour) %>%
  dplyr::select(-pub_utc_date, -speed, -delay, -yday, -yhour, -road_type)

#create a complete hourly scaffold for every fishnet cell in 2018
#credits: https://stackoverflow.com/questions/40227494/how-to-fill-in-missing-dates-in-range-by-group
#FIX: replaced the deprecated data_frame() with tibble().
df <- df1 %>%
  dplyr::select(fishnet_id) %>%
  group_by(fishnet_id) %>%
  summarise(count = n()) %>%
  dplyr::select(-count) %>%
  split(.$fishnet_id) %>%
  map_df(~tibble(fishnet_id = .$fishnet_id[1],
                 local_time = seq(as.POSIXct("2018-01-01 00:00:00", tz = "UTC"),
                                  as.POSIXct("2018-12-31 23:00:00", tz = "UTC"),
                                  by = "hour"))) %>%
  mutate(date = date(local_time))

#keep only the dates present in the 100 day Waze sample data
x <- jamsFishnetDir %>%
  dplyr::select(pub_utc_date) %>%
  mutate(pub_utc_date = ymd_hms(as.character(pub_utc_date)),
         date = date(pub_utc_date)) %>%
  dplyr::select(date) %>%
  distinct()
df <- df %>% filter(date %in% x$date) %>% dplyr::select(-date)

#aggregate the observed jams to cell-hours
df_1_1 <- df1 %>%
  dplyr::select(-D_west, -D_north, -D_south, -D_east) %>%
  group_by(fishnet_id, year, month, day, hour) %>%
  mutate(count = n()) %>%
  summarise_if(is.numeric, mean) %>%
  ungroup() %>%
  mutate(local_time = as.Date(paste(year, month, day, sep = '-')),  # rebuild timestamp for the join
         H = sprintf("%02d0000", hour),
         H = format(strptime(H, format = "%H%M%S"), format = "%H:%M:%S"),
         local_time = paste(local_time, H, sep = " "),
         #FIX: the original passed the misspelled argument `fortmat =` with
         #the wrong codes "%Y-%M-%D" (%M is minutes, %D is month/day/year);
         #use format = "%Y-%m-%d %H:%M:%S".
         local_time = as.POSIXct(local_time, tz = "UTC",
                                 format = "%Y-%m-%d %H:%M:%S")) %>%
  dplyr::select(-H, -year, -month, -day, -hour)

#join observed jams onto the scaffold; absent cell-hours become zero
df_2_1 <- left_join(df, df_1_1, by = c("local_time", "fishnet_id")) %>%
  replace(., is.na(.), 0)

#final imputed dataset: mean jam length per cell/month/day/hour
#FIX: `groupby` is not an R function (that is pandas); use group_by().
#FIX: month/day/hour were dropped above, so re-derive them from local_time.
imputedDF <- df_2_1 %>%
  mutate(month = month(local_time), day = day(local_time),
         hour = hour(local_time)) %>%
  group_by(fishnet_id, month, day, hour) %>%
  summarize(length = mean(length))

9.7 Wrangle data from OpenStreetMap (OSM)

#OBTAIN DATA FROM OPEN STREET MAP (OSM)

#CREATE BUFFER AROUND EACH FISHNET GRID CELL
#set buffer area (in meters)
buffer_area <- 200
#create a buffer around each fishnet grid cell
fishnetBuffer <- st_buffer(fishnet, buffer_area)

#CREATE LIST OF THE OSM FEATURES THAT WILL BE OBTAINED USING THE 'OSMDATA' R PACKAGE
#The first 7 key/value pairs are aggregated to the raw fishnet cells; the
#remaining 6 are aggregated to the buffered cells.
#list for OSM keys
key_lst = list("highway", "highway", "highway", "highway", "highway",
               "highway", "barrier", "amenity", "building", "landuse",
               "landuse", "landuse", "landuse")
#list for OSM values
val_lst = list("turning_circle", "mini_roundabout", c("stop", "give_way"),
               "crossing", "traffic_signal", "motorway_junction", "toll_booth",
               c("parking", "parking_entrance", "parking_space"), "office",
               "industrial", "commercial", "residential", "retail")
#column names for the OSM features joined to the fishnet dataframe
col_lst = list("turning_ct", "roundabt_ct", "sign_ct", "cross_ct",
               "traffic_signal", "junction", "toll_ct")
#column names for the OSM features joined to the fishnetBuffer dataframe
col_lst_buffer = list("parking_ct", "off_ct", "ind_ct", "comm_ct", "res_ct",
                      "ret_ct")

#OBTAIN OSM FEATURES, AGGREGATE BY FISHNET GRID CELL AND JOIN TO FISHNET DATAFRAME
#IN ORDER TO AVOID API QUERY TIME OUT, PLEASE RUN THIS CODE WHEN OSM API
#TRAFFIC IS LOW. OTHERWISE, RUN THE CODE FOR EACH OSM FEATURE SEPARATELY
#WITHOUT USING THE 'FOR' LOOP.
#FIX: the original loop rebuilt fishnetOSM from `fishnet` on every iteration,
#so only the last feature's counts survived. We now accumulate onto fishnetOSM
#and name each count column as it is joined, which also replaces the fragile
#positional rename loop (it started at index 3 of col_lst and therefore
#mislabelled the columns).
fishnetOSM <- fishnet
for (i in 1:length(col_lst)) {
  #get key/value for this feature
  key_inp = key_lst[[i]]
  val_inp = val_lst[[i]]
  #get osm data using the appropriate key and value
  osm_data <- add_osm_feature(opq = osm_cityBB, key = key_inp, value = val_inp)
  #convert osm dataframe to a sf object and take the point geometry
  osm_data_sf <- osmdata_sf(osm_data)
  osm_data_pt <- st_geometry(osm_data_sf$osm_points)
  #set the coordinate reference system - UTM 16N NAD83 meters
  osm_data_pt <- osm_data_pt %>% st_sf() %>%
    st_as_sf(., coords = c("longitude", "latitude"), crs = 4326) %>%
    st_transform(., crs = 32616)
  #clip osm data to jefferson county
  osm_data_clipped <- osm_data_pt[jeffersonCounty, ]
  #aggregate OSM points to each fishnet grid cell
  osm_data_sum <- st_intersection(fishnet, osm_data_clipped) %>%
    group_by(fishnet_id) %>%
    summarise(count = n())
  #drop geometry and label the count column with this feature's name
  osm_data_sum_df <- osm_data_sum %>% st_set_geometry(NULL)
  names(osm_data_sum_df)[names(osm_data_sum_df) == "count"] <- col_lst[[i]]
  #join aggregated OSM data back onto the accumulating fishnet dataframe
  fishnetOSM <- fishnetOSM %>% left_join(., osm_data_sum_df, by = "fishnet_id")
}
#check column names
names(fishnetOSM)
#remove osm data frames and free up memory space
rm(osm_data); rm(osm_data_clipped); rm(osm_data_pt)
rm(osm_data_sf); rm(osm_data_sum); rm(osm_data_sum_df)

#OBTAIN OSM FEATURES, AGGREGATE BY BUFFERED GRID CELL AND JOIN TO FISHNETBUFFER
#DATAFRAME. SAME API TIME-OUT CAVEAT AS ABOVE.
#FIX: the buffer features occupy positions 8-13 of key_lst/val_lst; the
#original indexed from 1 and therefore re-queried the fishnet features.
n_fishnet_feats <- length(col_lst)
for (i in 1:length(col_lst_buffer)) {
  key_inp = key_lst[[i + n_fishnet_feats]]
  val_inp = val_lst[[i + n_fishnet_feats]]
  osm_data <- add_osm_feature(opq = osm_cityBB, key = key_inp, value = val_inp)
  osm_data_sf <- osmdata_sf(osm_data)
  osm_data_pt <- st_geometry(osm_data_sf$osm_points)
  #set the coordinate reference system - UTM 16N NAD83 meters
  osm_data_pt <- osm_data_pt %>% st_sf() %>%
    st_as_sf(., coords = c("longitude", "latitude"), crs = 4326) %>%
    st_transform(., crs = 32616)
  osm_data_clipped <- osm_data_pt[jeffersonCounty, ]
  #aggregate OSM points to each buffered grid cell
  osm_data_sum <- st_intersection(fishnetBuffer, osm_data_clipped) %>%
    group_by(fishnet_id) %>%
    summarise(count = n())
  osm_data_sum_df <- osm_data_sum %>% st_set_geometry(NULL)
  names(osm_data_sum_df)[names(osm_data_sum_df) == "count"] <- col_lst_buffer[[i]]
  fishnetBuffer <- fishnetBuffer %>%
    left_join(., osm_data_sum_df, by = "fishnet_id")
}
names(fishnetBuffer)
rm(osm_data); rm(osm_data_clipped); rm(osm_data_pt)
rm(osm_data_sf); rm(osm_data_sum); rm(osm_data_sum_df)

#OBTAIN AND ADD OSM BUILDING DATA (TOTAL NUMBER OF BUILDINGS) TO FISHNETBUFFER
#NOTE: the OSM building layer is very large and slow to download via the
#Overpass query; download the shapefile from Geofabrik
#(https://download.geofabrik.de/north-america.html) and load it locally.
buildings_kentucky <- read_sf("kentuckyOSM/gis_osm_buildings_a_free_1.shp")
#set the coordinate reference system - UTM 16N NAD83 meters
#NOTE(review): relies on the shapefile's .prj declaring its CRS (Geofabrik
#ships EPSG:4326) -- confirm before transforming.
buildings_kentucky <- buildings_kentucky %>% st_transform(., crs = 32616)
#clip OSM building data to Jefferson County
building_clipped <- buildings_kentucky[jeffersonCounty, ]
#aggregate building counts to each buffered grid cell
building_clipped_sum <- st_intersection(fishnetBuffer, building_clipped) %>%
  group_by(fishnet_id) %>%
  summarise(count = n())
building_clipped_sum_df <- building_clipped_sum %>% st_set_geometry(NULL)
#label the count column and join back to fishnetBuffer
#FIX: renamed by name rather than via ncol(fishnetBuffer) - 1, which assumed
#the geometry column is always last.
names(building_clipped_sum_df)[names(building_clipped_sum_df) == "count"] <- "tot_blng_cnt"
fishnetBuffer <- fishnetBuffer %>%
  left_join(., building_clipped_sum_df, by = "fishnet_id")
names(fishnetBuffer)

#JOIN OSM FEATURES AGGREGATED TO FISHNETBUFFER TO FISHNET DATAFRAME
#drop geometry for fishnetBuffer dataframe
fishnetBuffer_df <- fishnetBuffer %>% st_set_geometry(NULL)
#select only the buffer-derived OSM feature columns and join by fishnet ID
#FIX: selected by name instead of by position (1, 3:blng_col_num) for
#robustness against column reordering.
fishnetOSM <- fishnetBuffer_df %>%
  dplyr::select(fishnet_id, all_of(unlist(col_lst_buffer)), tot_blng_cnt) %>%
  left_join(., fishnetOSM, by = "fishnet_id")
#convert all NA values (cells with no features) to zero
fishnetOSM[is.na(fishnetOSM)] <- 0

9.8 Wrangle weather data from Dark Sky API

#OBTAIN WEATHER DATA FROM DARK SKY API

#LOAD API KEYS
#SECURITY FIX: the original hard-coded an API key in source; read it from an
#environment variable instead (set DARKSKY_API_KEY before running).
rwunderground::set_api_key(Sys.getenv("DARKSKY_API_KEY"))
alerts(set_location(territory = "Kentucky ", city = "Louisville"))
darksky_api_key(Sys.getenv("DARKSKY_API_KEY"))

#CREATE A DATAFRAME FOR ALL HOURS IN A DAY FOR THE WHOLE YEAR OF 2018
#create dataframe of hourly timestamps
dateDF <- seq(as.POSIXct("2018-01-01 00:00:00", tz = "America/Louisville"),
              as.POSIXct("2018-12-31 23:00:00", tz = "America/Louisville"),
              by = "hour") %>%
  as.data.frame()
#rename the timestamp column name
names(dateDF)[1] <- paste("dateTimeObject")
#create a column for dates in string/character format
dateDF <- dateDF %>% mutate(dateString = format.Date(dateTimeObject))

#CREATE A DATAFRAME WITH THE REQUISITE COLUMNS FOR THE WEATHER DATA
#make a request for one day to get the columns for weather data
weatherRequest <- as.data.frame(
  get_forecast_for(38.201033, -85.651970, "2018-02-01T12:00:00")[["hourly"]])
#make an empty dataframe with the weather columns
weather <- head(weatherRequest, 0)

#GET THE WEATHER DATA FOR THE ENTIRE YEAR
#loop through every day of the year (step of 24 hourly rows)
#FIX: collect daily frames in a list and bind once at the end -- the original
#smartbind() inside the loop grows the dataframe quadratically. Also removed
#the no-op `paste(date, sep = " ")`.
weatherDaily_lst <- list()
for (i in seq(1, length(dateDF$dateString), by = 24)) {
  #Dark Sky expects "YYYY-MM-DDTHH:MM:SS"; build it from the stored string
  date <- dateDF$dateString[i]
  dateAPI <- stringi::stri_sub(date, 1, 10)
  timeAPI <- stringi::stri_sub(date, 12, 19)
  joinedAPI <- paste(c(dateAPI, timeAPI), collapse = "T")
  #get the daily weather-by-hour data
  weatherDaily <- as.data.frame(
    get_forecast_for(38.201033, -85.651970, joinedAPI)[["hourly"]])
  weatherDaily_lst[[length(weatherDaily_lst) + 1]] <- weatherDaily
}
#bind all days onto the (empty) template so the column layout is fixed
weather <- do.call(smartbind, c(list(weather), weatherDaily_lst))
#convert any NA values to zero
weather[is.na(weather)] <- 0

9.9 Wrangle Waze accident alerts data

#OBTAIN WAZE ACCIDENT ALERTS DATA
#load accident alerts data - same sample days as jams data
accidents <- read.csv("alerts_bigQ.csv")
#convert to sf and set the coordinate reference system - UTM 16N NAD83 meters
#FIX: the original piped from `accident`, which does not exist -- the object
#read above is `accidents`.
accident_sf <- accidents %>%
  st_as_sf(., coords = c("longitude", "latitude"), crs = 4326) %>%
  st_transform(., crs = 32616) %>%
  st_sf()
#aggregate the accident alerts for each hexagonal grid cell (fishnet)
accidentFishnet <- st_intersection(fishnet, accident_sf) %>%
  group_by(fishnet_id) %>%
  summarise(accCount = n())

9.10 Join OSM and weather API data, accident alerts and holiday data to the jams and imputed jams dataframes

#JOIN API DATA, ACCIDENT ALERTS AND HOLIDAY DATA TO THE JAMS AND IMPUTED DATAFRAMES

#drop geometry - convert sf objects to plain dataframes
fishnetOSM_df <- fishnetOSM %>% st_set_geometry(NULL)
accidentFishnet_df <- accidentFishnet %>% st_set_geometry(NULL)

#join OSM variables and accident alerts by fishnet_id
#join with jamsFishnetDir dataframe
jamsFishnetDir <- jamsFishnetDir %>% left_join(., fishnetOSM_df, by = "fishnet_id")
jamsFishnetDir <- jamsFishnetDir %>% left_join(., accidentFishnet_df, by = "fishnet_id")
#join with imputedDF dataframe
imputedDF <- imputedDF %>% left_join(., fishnetOSM_df, by = "fishnet_id")
imputedDF <- imputedDF %>% left_join(., accidentFishnet_df, by = "fishnet_id")

#create time variables for the weather data
#FIX: "UTZ" is not a valid time-zone name (should be "UTC"), and the original
#mutate() call was missing its closing parenthesis.
weather <- weather %>%
  mutate(dateTimeObject = as.POSIXct(time, tz = "UTC", format = "%Y-%m-%d %H:%M:%S"),
         local_time = as.POSIXct(time, tz = "UTC", format = "%Y-%m-%d %H:%M:%S"))
#join weather data by time to jams and imputed data
#NOTE(review): assumes jamsFishnetDir carries a `dateTimeObject` column and
#imputedDF a `local_time` column -- confirm upstream naming (9.4 creates
#`dateTimeObj`).
jamsFishnetDir <- jamsFishnetDir %>% left_join(., weather, by = "dateTimeObject")
imputedDF <- imputedDF %>% left_join(., weather, by = "local_time")

#load holiday data
holiday <- read.csv("Holiday2018.csv")
#create time variables for the holiday data (same FIXes as for weather)
holiday <- holiday %>%
  mutate(dateTimeObject = as.POSIXct(time, tz = "UTC", format = "%Y-%m-%d %H:%M:%S"),
         local_time = as.POSIXct(time, tz = "UTC", format = "%Y-%m-%d %H:%M:%S"))
#join holiday data by time to jams and imputed data
jamsFishnetDir <- jamsFishnetDir %>% left_join(., holiday, by = "dateTimeObject")
imputedDF <- imputedDF %>% left_join(., holiday, by = "local_time")

9.11 Create final dataframes

#CREATE FINAL DATAFRAMES
#non aggregated/non-imputed jams data
finalDF <- jamsFishnetDir
rm(jamsFishnetDir)
#aggregated by hour/imputed jams data is already held in imputedDF
#FIX: removed the no-op self-assignment `imputedDF <- imputedDF`.

9.12 Select relevant features from imputed dataframe and run correlation

#SELECT ALL FEATURES FROM IMPUTED JAMS DATAFRAME AND RUN CORRELATION

#ADD BINARY FEATURES - FEATURE ENGINEERING
#add binary variables - weekend, peak and freeway
#FIX: the original contained `freeway = freeway = as.factor(...)` (a syntax
#error) and the mutate() call was missing its closing parenthesis.
finalDF <- finalDF %>%
  mutate(weekend = ifelse(weekday %in% c("Saturday", "Sunday"), 1, 0),  #Saturday, Sunday
         peak = ifelse(hour %in% c(7, 8, 18, 19), 1, 0),  #peak hr 7am-9am and 6pm-8pm
         freeway = as.factor(if_else(road_type == 3, 1, 0)))  #highways, freeway, etc.

#reorder column names
#NOTE(review): "parking_ct2" and "ret_cnt" do not match the names created in
#9.7 ("parking_ct"/"ret_ct") -- confirm against the actual columns of
#imputedDF before subsetting.
col_order <- c("fishnet_id", "local_time", "date", "icon", "weekday", "holiday",
               "weekend", "peak", "freeway", "precipProbability", "temperature",
               "humidity", "pressure", "windSpeed", "cloudCover", "visibility",
               "month", "day", "hour", "count", "parking_ct2",
               "turning_ct", "roundabt_ct", "sign_ct", "cross_ct", "toll_ct",
               "traffic_signal", "junction", "off_ct", "ind_ct", "comm_ct",
               "res_ct", "ret_cnt", "tot_blng_cnt", "al_rel", "al_count",
               "interaction", "length", "jamQtl", "blngQtl", "geometry")
#rearrange columns
imputedDF_Features <- imputedDF[, col_order]
names(imputedDF_Features)

#display names for the columns (applied positionally below, so this list must
#stay aligned with col_order)
columnName_lst <- list("Fishnet ID", "Local_Time", "Date", "Weather_Event",
                       "Weekday", "Holiday", "Weekend", "Peak", "Freeway",
                       "Precipitation_Probability", "Temperature", "Humidity",
                       "Pressure", "Wind Speed", "Cloud Cover", "Visibility",
                       "Month", "Day", "Hour", "Jam_Count", "Parking",
                       "Turning_Circles", "Roundabouts", "Road_Signs",
                       "Crossings", "Tolls", "Traffic Signals", "Junctions",
                       "Office_Building", "Industrial_Building",
                       "Commercial_Building", "Residential_Building",
                       "Retail_Building", "Total_Buildings",
                       "AccidentAlert_Probability", "AccidentAlert_Counts",
                       "Hour_Day_Interaction", "Length", "jamQtl", "blngQtl",
                       "geometry")
#rename columns
for (i in 1:length(columnName_lst)) {
  names(imputedDF_Features)[i] <- paste(columnName_lst[[i]])
}
names(imputedDF_Features)

#RUN CORRELATION MATRIX FOR ALL NUMERIC FEATURES
#NOTE(review): corrplot() requires library(corrplot), which is not in the
#appendix library list -- load it before running this chunk.
#select variables for correlation plot
corr_df <- imputedDF_Features %>% dplyr::select(7:37)
M <- cor(corr_df)
corrplot(M, method = "number")

9.13 Feature selection using the Boruta Package

#FEATURE SELECTION USING THE BORUTA PACKAGE
#randomly pick a week in 2018
#FIX: the capitalized column names ("Date", "Length") only exist on
#imputedDF_Features (renamed in 9.12), not on imputedDF -- filter and model
#on imputedDF_Features.
featureDF <- imputedDF_Features %>%
  filter(Date %in% c("2018-07-08", "2018-07-09", "2018-07-10", "2018-07-11",
                     "2018-07-12", "2018-07-13", "2018-07-14"))
#select all features and the dependent variable in our dataset
featureDF <- featureDF %>% dplyr::select(4:38)
#names(featureDF)
#drop rows with any missing values
featureDF <- featureDF[complete.cases(featureDF), ]
#column numbers holding categorical variables
convert <- c(1, 2, 3)
#convert categorical variables into factor data type
#FIX: apply() coerces the data frame to a character matrix; lapply preserves
#each column's type while converting to factor.
featureDF[convert] <- lapply(featureDF[convert], as.factor)
#run Boruta model (seeded for reproducibility)
set.seed(123)
borutaTrain <- Boruta(Length ~ ., data = featureDF, doTrace = 2)
#print(borutaTrain)
borutaTrainDF <- attStats(borutaTrain)
#resolve tentative attributes and get the final selection
finalBoruta <- TentativeRoughFix(borutaTrain)
#print(finalBoruta)
borutaFinalDF <- attStats(finalBoruta)

9.14 Prepare imputed data for modeling

#PREPARE IMPUTED DATA FOR MODELING

#REMOVE OUTLIERS FROM DATASET
#filter out dates relating to flood events in February 2018
#(%notin% is provided by the mefa4 package loaded in 9.1)
imputedDF <- imputedDF %>%
  filter(date %notin% c("2018-02-23", "2018-02-24", "2018-02-25",
                        "2018-02-26", "2018-02-27", "2018-02-28"))

#SELECT RELEVANT FEATURES
#exclude features as per the feature-selection analysis
#FIX: removed the trailing comma after -geometry, which is a syntax error.
imputedDF <- imputedDF %>%
  dplyr::select(-precipProbability, -month, -off_ct, -roundabt_ct,
                -holiday, -date, -geometry)

9.15 Prophet Model

####Regressor####
#Binary regressor: 1 on Saturday/Sunday, 0 otherwise.
weekend <- function(ds) {
  dates <- as.Date(ds)
  weekday <- weekdays(dates)
  as.numeric((weekday == "Sunday") | (weekday == "Saturday"))
}
#Binary regressor: 1 during peak hours (7-10am, 6-8pm) on non-weekend days.
#NOTE(review): `weekday == '0'` compares the numeric output of weekend()
#against the string '0' (R coerces the numbers to character here); it looks
#like `weekday == 0` was intended -- confirm.
peakHour <- function(ds) {
  dates <- as.POSIXct(ds)
  hour <- hour(dates)
  weekday <- weekend(ds)
  as.numeric(((hour >= 7 & hour <= 10) | (hour >= 18 & hour <= 20)) & (weekday == '0'))
}

#Prophet model with holiday effects and all seasonalities enabled.
#NOTE(review): `hol` (holiday dataframe) and the Train_length/Test_length
#series (columns ds, y plus the regressors) are prepared elsewhere in the
#project and are not shown in this appendix.
m <- prophet(holidays = hol, holidays.prior.scale = 5,
             daily.seasonality = TRUE, weekly.seasonality = TRUE,
             yearly.seasonality = TRUE, growth = "linear")
#add the external regressors (weather flags plus the two functions above)
m2 <- m %>%
  add_regressor('snow', standardize = FALSE) %>%
  add_regressor('rain', standardize = FALSE) %>%
  add_regressor('cloudy', standardize = FALSE) %>%
  add_regressor('weekend', standardize = FALSE) %>%
  add_regressor('peakHour', standardize = FALSE)

#fit on the training series and report train goodness of fit (MAE/MAPE/RMSE)
m_train <- fit.prophet(m2, Train_length)
pre_train <- predict(m_train, Train_length) %>%
  left_join(Train_length, by = 'ds') %>%
  mutate(mae = abs(yhat - y), mape = abs(yhat - y) / y)
mean(pre_train$mae)
mean(pre_train$mape)
sqrt(mean(pre_train$mae^2))

#predict on the test series and report test goodness of fit
pre_test <- predict(m_train, Test_length) %>%
  left_join(Test_length, by = 'ds') %>%
  mutate(mae = abs(yhat - y), mape = abs(yhat - y) / y)
mean(pre_test$mae)
mean(pre_test$mape)
sqrt(mean(pre_test$mae^2))

#plot predictions and model components for fishnet 6696
plot(m_train, pre_test) +
  labs(title = "Predicted Length", subtitle = "Fishnet 6696") +
  theme_bw() +
  coord_cartesian(ylim = c(0, 1000))
prophet_plot_components(m_train, pre_test)

#predicted vs observed scatter with a linear fit line
ggplot() +
  geom_point(data = pre_test, aes(y, yhat), size = 1) +
  stat_smooth(data = pre_test, aes(y, yhat), method = "lm", se = FALSE,
              size = 1, colour = "blue") +
  labs(title = "Predicted Length as a function of Observed Length",
       subtitle = "Fishnet 6696") +
  theme(plot.title = element_text(size = 14, colour = "black"),
        plot.subtitle = element_text(face = "italic")) +
  xlim(0, 800) +
  theme_bw()

####Cross-validation####
#rolling-origin cross-validation: 1680 h initial window, 24 h period,
#100 h forecast horizon
x <- cross_validation(m_train, initial = 1680, period = 24, horizon = 100,
                      units = 'hours')
x_ <- as.data.frame(performance_metrics(x))
mean(x_$mape)
mean(x_$mae)
mean(x_$rmse)
plot_cross_validation_metric(x, metric = 'mape')
ggplot(x_, aes(x = mape)) +
  geom_histogram(bins = 8) +
  labs(x = "MAPE", y = "Count", title = "Cross-validation of MAPE") +
  theme_bw()

9.16 Random Forest Model

#RANDOM FOREST MODEL

#CREATE FUNCTIONS TO RUN RANDOM FOREST MODEL AND GENERATE PREDICTIONS

#Generate goodness-of-fit metrics on the TRAIN partition for one fishnet cell.
#FIX: the original filtered into `fishnetDF` but then partitioned and modeled
#an undefined `modelDF`; the filtered data is now used throughout.
rfTrainPredFunction <- function(df, fishnetID) {
  #filter to the requested fishnet cell
  fishnetDF <- df %>% filter(fishnet_id == fishnetID)
  #partition data into train and test (80/20)
  inTrain <- createDataPartition(y = fishnetDF$length, p = .80, list = FALSE)
  trainDF <- fishnetDF[inTrain, ]
  testDF <- fishnetDF[-inTrain, ]
  #run random forest model
  rfModel <- randomForest(length ~ ., data = trainDF, ntree = 500,
                          importance = TRUE)
  #generate predictions for train data
  trainDF$fitted <- predict(rfModel, trainDF)
  #convert dependent variable from seconds to minutes; compute metrics
  trainDF <- trainDF %>%
    mutate(lengthMin = length / 60,
           fittedMin = fitted / 60,
           MAE = abs(lengthMin - fittedMin),
           MAPE = abs((lengthMin - fittedMin) / fittedMin),
           RMSE = sqrt(mean(abs(lengthMin - fittedMin)^2)))
  #mean of metrics (FIX: `fitedMin` typo corrected to `fittedMin`)
  output <- trainDF %>%
    summarise(Observed = mean(lengthMin), Predicted = mean(fittedMin),
              MAE = mean(MAE), MAPE = mean(MAPE), RMSE = mean(RMSE))
  return(output)
}

#Same as above, but reporting metrics on the held-out TEST partition.
rfTestPredFunction <- function(df, fishnetID) {
  fishnetDF <- df %>% filter(fishnet_id == fishnetID)
  inTrain <- createDataPartition(y = fishnetDF$length, p = .80, list = FALSE)
  trainDF <- fishnetDF[inTrain, ]
  testDF <- fishnetDF[-inTrain, ]
  rfModel <- randomForest(length ~ ., data = trainDF, ntree = 500,
                          importance = TRUE)
  #generate predictions for test data
  testDF$fitted <- predict(rfModel, testDF)
  testDF <- testDF %>%
    mutate(lengthMin = length / 60,
           fittedMin = fitted / 60,
           MAE = abs(lengthMin - fittedMin),
           MAPE = abs((lengthMin - fittedMin) / fittedMin),
           RMSE = sqrt(mean(abs(lengthMin - fittedMin)^2)))
  output <- testDF %>%
    summarise(Observed = mean(lengthMin), Predicted = mean(fittedMin),
              MAE = mean(MAE), MAPE = mean(MAPE), RMSE = mean(RMSE))
  return(output)
}

#list of fishnet cells selected for modeling
fishnet_lst <- list("6696", "7640", "5386", "3206")

#run the model for all selected fishnets
rfResults <- data.frame()
for (i in fishnet_lst) {
  fishnetID <- i
  #goodness of fit metrics for train data, tagged with the fishnet ID
  trainPred <- rfTrainPredFunction(imputedDF, fishnetID) %>%
    mutate(FishnetID = fishnetID, Type = "Train")
  #goodness of fit metrics for test data
  testPred <- rfTestPredFunction(imputedDF, fishnetID) %>%
    mutate(FishnetID = fishnetID, Type = "Test")
  #append results
  rfResults <- rbind(rfResults, trainPred, testPred)
}
#rfResults

9.17 Mixed Effect Model