Prepared with Ceren Demirkol, Okan Güven, Sevgican Varol
require(data.table)
# Seed the random number generator before any other work is done.
set.seed(123)

# ---- Load and clean the hourly consumption data ----
# Read the raw CSV export and rename its third column (selected by position)
# to `value`.
consumption <- fread("C:/Users/ceren.orhan/Desktop/ETM 58D/HW2-3/GercekZamanliTuketim-01012016-19052020.csv")
setnames(consumption, names(consumption)[3], "value")

# Parse `Tarih` (dd.mm.YYYY text) into a Date and take the first two
# characters of `Saat` as a numeric hour, then keep only the working columns.
consumption[, `:=`(
  date = as.Date(Tarih, "%d.%m.%Y"),
  hour = as.numeric(substr(Saat, 1, 2))
)]
consumption <- consumption[, list(date, hour, value)]

# Convert the value text to numeric: first drop every literal "." and then
# turn the "," into a "." so that as.numeric() can parse it.
consumption[, value := gsub(".", "", value, fixed = TRUE)]
consumption[, value := as.numeric(gsub(",", ".", value, fixed = TRUE))]

# Add the value from 168 rows earlier and from 48 rows earlier as lag
# features (shift() works on row position, so this relies on the row order
# of the file).
consumption[, c("lag_168", "lag_48") := shift(value, c(168, 48))]

# Keep only rows without any missing field (this removes the leading rows
# whose lags are NA) and preview the result.
full_consumption <- consumption[complete.cases(consumption), ]
head(full_consumption)
# ---- Linear regression on the 48- and 168-row lags ----
# Chronological split: rows dated before 2020-03-01 are used for training,
# rows from that date onward are held out for evaluation.
split_date <- as.Date("2020-03-01")
long_tr <- full_consumption[date < split_date]
long_te <- full_consumption[date >= split_date]

# Fit value ~ lag_48 + lag_168 on the training rows, predict the held-out
# rows, and print the model summary. This is done exactly once.
long_fit <- lm(value ~ lag_48 + lag_168, data = long_tr)
long_pred <- predict(long_fit, newdata = long_te)
summary(long_fit)

# Attach the predictions and the absolute percentage error (in percent) to
# the test rows. `:=` adds the columns to `long_te` in place, and
# `long_predicted` refers to that same table.
long_predicted <- long_te[, predicted_consumption := long_pred]
long_predicted[, APE_LR := abs(value - predicted_consumption) / abs(value) * 100]

# Mean absolute percentage error of the regression over the test rows.
MAPE_LR <- mean(long_predicted$APE_LR)
# ---- Compare the naive lag forecasts with the regression ----
# Absolute percentage error (in percent) obtained when the lagged value
# itself is used as the forecast. The denominator uses abs(value), the same
# definition used for APE_LR.
full_consumption[, ape_168 := abs(value - lag_168) / abs(value) * 100]
full_consumption[, ape_48 := abs(value - lag_48) / abs(value) * 100]

# NOTE(review): the two lag baselines are computed over every row of
# `full_consumption`, whereas APE_LR only exists for the rows in
# `long_predicted` -- confirm that comparing these two populations is intended.
boxplot(
  full_consumption$ape_168,
  full_consumption$ape_48,
  long_predicted$APE_LR,
  names = c("Lag 168", "Lag 48", "Linear Regression")
)
title("Absolute Percentage Error")
summary(long_predicted$APE_LR)

# Selected quantiles of each error distribution, side by side. The lag APE
# columns live on `full_consumption` (they are created just above), so the
# quantiles are read from there, matching the data shown in the boxplot.
ape_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
quantile_lag_168 <- quantile(full_consumption$ape_168, probs = ape_probs)
quantile_lag_48 <- quantile(full_consumption$ape_48, probs = ape_probs)
quantile_lin_reg <- quantile(long_predicted$APE_LR, probs = ape_probs)
q_all <- cbind(quantile_lag_168, quantile_lag_48, quantile_lin_reg)
q_all
Comparing the boxplot of the linear regression model with those of the lag-based forecasts, we see that the regression boxplot is more similar to that of the lag_168 forecast.
The main difference is that the upper half of the interquartile range (Q3-Q2) is wider (it has a little more variance) and there are slightly more outliers.
We can conclude that using the consumption at the same hour of the previous week as the forecast is nearly as good as using linear regression with the 48- and 168-hour lags.