Homework 2 & 3

Part B

# Setup: attach data.table (provides fread, setnames, shift and `:=` used below)
# and fix the random seed.
library(data.table)
set.seed(123)

Since we chose to tackle the parts of the questions separately, we will read in the data and transform it again as necessary.

# Reading and editing data
# Load the raw consumption export and rename its third column to "value".
consumption <- fread("GercekZamanliTuketim-01012016-19052020.csv")
setnames(consumption, names(consumption)[3], "value")

# Parse the day from Tarih (dd.mm.yyyy) and take the hour from the first two
# characters of Saat, then keep only the columns needed downstream.
consumption[, `:=`(
  date = as.Date(Tarih, "%d.%m.%Y"),
  hour = as.numeric(substr(Saat, 1, 2))
)]
consumption <- consumption[, .(date, hour, value)]

# Remove every "." and turn "," into the decimal point before converting to
# numeric (the raw figures appear to use "." for thousands and "," for decimals).
consumption[, value := as.numeric(
  gsub(",", ".", gsub(".", "", value, fixed = TRUE), fixed = TRUE)
)]

# Lagged consumption taken 168 and 48 rows back; this corresponds to one week
# and two days only if the rows are consecutive hours -- TODO confirm.
consumption[, c("lag_168", "lag_48") := shift(value, n = c(168, 48))]

# Drop the rows that have a missing value in any column (e.g. the first rows,
# for which the lags are not available) and preview the result.
full_consumption <- consumption[complete.cases(consumption)]
head(full_consumption)

##          date hour    value  lag_168   lag_48
## 1: 2016-01-08    0 28602.02 26277.24 29189.27
## 2: 2016-01-08    1 27112.37 24991.82 27614.02
## 3: 2016-01-08    2 25975.34 23532.61 26578.97
## 4: 2016-01-08    3 25315.55 22464.78 25719.19
## 5: 2016-01-08    4 25128.15 22002.91 25864.63
## 6: 2016-01-08    5 25356.22 21957.08 25918.59

We split the long-format data into a training set (dates before 2020-03-01) and a test set (dates from 2020-03-01 onwards).

# Split the long-format data at 2020-03-01:
# earlier days form the training set, that day and later form the test set.
long_tr <- full_consumption[date < as.Date("2020-03-01")]
long_te <- full_consumption[date >= as.Date("2020-03-01")]

We fitted a linear regression model on the training data, using lag 48 and lag 168 as predictors, and then made predictions on the test data.

# Regress consumption on its 48- and 168-row lags using the training set,
# predict for the test set, and print the fitted model's summary.
long_fit <- lm(value ~ lag_48 + lag_168, data = long_tr)
long_pred <- predict(long_fit, newdata = long_te)
summary(long_fit)

## 
## Call:
## lm(formula = value ~ lag_48 + lag_168, data = long_tr)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16854.6   -977.3     -7.7    994.7  16055.2 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.633e+03  9.207e+01   17.74   <2e-16 ***
## lag_48      3.111e-01  3.345e-03   92.99   <2e-16 ***
## lag_168     6.394e-01  3.341e-03  191.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2371 on 36309 degrees of freedom
## Multiple R-squared:  0.7676, Adjusted R-squared:  0.7676 
## F-statistic: 5.996e+04 on 2 and 36309 DF,  p-value: < 2.2e-16

We added the predictions to the long-format test table and calculated the APE and MAPE of the linear regression model (which uses lag_48 and lag_168 as predictors).

# Create the long-format result table and compute APE / MAPE of the regression.
# `:=` adds the column to long_te by reference, so long_predicted below is the
# same table as long_te (not a copy).
long_te[, predicted_consumption := long_pred]
long_predicted <- long_te

# Absolute percentage error (in %) of each test prediction.
long_predicted[, APE_LR := abs(value - predicted_consumption) / abs(value) * 100]

# Mean absolute percentage error over the test set.
MAPE_LR <- long_predicted[, mean(APE_LR)]

We also checked the summary statistics and plotted a boxplot.

# Absolute percentage error (in %) of the two naive forecasts, i.e. using the
# lagged value itself as the prediction. abs() in the denominator matches the
# definition used for APE_LR. The columns are kept on full_consumption.
full_consumption[, ape_168 := abs(value - lag_168) / abs(value) * 100]
full_consumption[, ape_48 := abs(value - lag_48) / abs(value) * 100]

# The regression errors exist only for the test period, so the naive errors are
# restricted to the same period (same cutoff as the train/test split).
# Otherwise the boxes would describe different sets of observations.
naive_te <- full_consumption[date >= as.Date("2020-03-01")]

boxplot(
  naive_te$ape_168, naive_te$ape_48, long_predicted$APE_LR,
  names = c("Lag 168", "Lag 48", "Linear Regression")
)
title("Absolute Percentage Error")

# Summary statistics of the regression's absolute percentage error.
summary(long_predicted$APE_LR)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.01148  1.83166  4.03732  5.69813  7.61776 41.19968

# 10%, 25%, 50%, 75% and 90% quantiles of the regression's APE on the test set.
quantile_lin_reg <- quantile(
  long_predicted[["APE_LR"]],
  probs = c(0.10, 0.25, 0.50, 0.75, 0.90)
)
quantile_lin_reg

##        10%        25%        50%        75%        90% 
##  0.7542483  1.8316559  4.0373182  7.6177555 12.5462119

Comparing the boxplot of the linear regression model with those of the lag forecasts, we see that the regression boxplot is most similar to that of the lag_168 forecast. The main differences are that the upper half of its interquartile range (Q3-Q2) is wider (it has slightly more variance) and that it has slightly more outliers. We can conclude that using the consumption at the same time in the previous week is pretty much as good as using linear regression with the 48- and 168-hour lags.