library("data.table")
library("rlist")
# Read in data and arrange variable types
consumption = fread("GercekZamanliTuketim-01012016-19052020.csv")
setnames(consumption,names(consumption)[3],'value')
consumption[,date:=as.Date(Tarih,'%d.%m.%Y')]
consumption[,hour:=as.numeric(substr(Saat,1,2))]
consumption=consumption[,list(date,hour,value)]
consumption[,value:=gsub(".", "",value, fixed = TRUE)]
consumption[,value:=as.numeric(gsub(",", ".",value, fixed = TRUE))]
# Shift data and clear N/A
consumption = consumption[,lag_48:=shift(consumption[,3],48)]
consumption = consumption[,lag_168:=shift(consumption[,3],168)]
full_consumption = consumption[complete.cases(consumption)]
We created training and test sets in wide format.
# Create training and test consumption data in wide format with lag_168
long_tr = full_consumption[date < '2020-03-01']
long_te = full_consumption[date >= '2020-03-01']
wide_tr = merge(dcast(long_tr,date~paste0('actual_hour_',hour), value.var='value'), dcast(long_tr,date~paste0('lag_168_hour_',hour),value.var='lag_168'), by = 'date')
wide_te = merge(dcast(long_te,date~paste0('actual_hour_',hour), value.var='value'), dcast(long_te,date~paste0('lag_168_hour_',hour),value.var='lag_168'), by = 'date')
We fitted the linear regression models for each hour, using training dataset.
# Fit linear regression models
wide_fit_0 = lm(actual_hour_0~lag_168_hour_0, wide_tr)
wide_fit_1 = lm(actual_hour_1~lag_168_hour_1, wide_tr)
wide_fit_2 = lm(actual_hour_2~lag_168_hour_2, wide_tr)
wide_fit_3 = lm(actual_hour_0~lag_168_hour_3, wide_tr)
wide_fit_4 = lm(actual_hour_0~lag_168_hour_4, wide_tr)
wide_fit_5 = lm(actual_hour_0~lag_168_hour_5, wide_tr)
wide_fit_6 = lm(actual_hour_0~lag_168_hour_6, wide_tr)
wide_fit_7 = lm(actual_hour_0~lag_168_hour_7, wide_tr)
wide_fit_8 = lm(actual_hour_0~lag_168_hour_8, wide_tr)
wide_fit_9 = lm(actual_hour_0~lag_168_hour_9, wide_tr)
wide_fit_10 = lm(actual_hour_0~lag_168_hour_10, wide_tr)
wide_fit_11 = lm(actual_hour_0~lag_168_hour_11, wide_tr)
wide_fit_12 = lm(actual_hour_0~lag_168_hour_12, wide_tr)
wide_fit_13 = lm(actual_hour_0~lag_168_hour_13, wide_tr)
wide_fit_14 = lm(actual_hour_0~lag_168_hour_14, wide_tr)
wide_fit_15 = lm(actual_hour_0~lag_168_hour_15, wide_tr)
wide_fit_16 = lm(actual_hour_0~lag_168_hour_16, wide_tr)
wide_fit_17 = lm(actual_hour_0~lag_168_hour_17, wide_tr)
wide_fit_18 = lm(actual_hour_0~lag_168_hour_18, wide_tr)
wide_fit_19 = lm(actual_hour_0~lag_168_hour_19, wide_tr)
wide_fit_20 = lm(actual_hour_0~lag_168_hour_20, wide_tr)
wide_fit_21 = lm(actual_hour_0~lag_168_hour_21, wide_tr)
wide_fit_22 = lm(actual_hour_0~lag_168_hour_22, wide_tr)
wide_fit_23 = lm(actual_hour_0~lag_168_hour_23, wide_tr)
We performed the predictions using the lm models for each hour, and created a wide format table with predicted consumption.
# Make prediction with linear regression models
wide_pred_0 = predict(wide_fit_0, wide_te)
wide_pred_1 = predict(wide_fit_1, wide_te)
wide_pred_2 = predict(wide_fit_2, wide_te)
wide_pred_3 = predict(wide_fit_3, wide_te)
wide_pred_4 = predict(wide_fit_4, wide_te)
wide_pred_5 = predict(wide_fit_5, wide_te)
wide_pred_6 = predict(wide_fit_6, wide_te)
wide_pred_7 = predict(wide_fit_7, wide_te)
wide_pred_8 = predict(wide_fit_8, wide_te)
wide_pred_9 = predict(wide_fit_9, wide_te)
wide_pred_10 = predict(wide_fit_10, wide_te)
wide_pred_11 = predict(wide_fit_11, wide_te)
wide_pred_12 = predict(wide_fit_12, wide_te)
wide_pred_13 = predict(wide_fit_13, wide_te)
wide_pred_14 = predict(wide_fit_14, wide_te)
wide_pred_15 = predict(wide_fit_15, wide_te)
wide_pred_16 = predict(wide_fit_16, wide_te)
wide_pred_17 = predict(wide_fit_17, wide_te)
wide_pred_18 = predict(wide_fit_18, wide_te)
wide_pred_19 = predict(wide_fit_19, wide_te)
wide_pred_20 = predict(wide_fit_20, wide_te)
wide_pred_21 = predict(wide_fit_21, wide_te)
wide_pred_22 = predict(wide_fit_22, wide_te)
wide_pred_23 = predict(wide_fit_23, wide_te)
# Create wide format table with predicted consumption
wide_predicted = wide_te[, 1:25]
wide_predicted = wide_predicted[, predicted_0 := wide_pred_0]
wide_predicted = wide_predicted[, predicted_1 := wide_pred_1]
wide_predicted = wide_predicted[, predicted_2 := wide_pred_2]
wide_predicted = wide_predicted[, predicted_3 := wide_pred_3]
wide_predicted = wide_predicted[, predicted_4 := wide_pred_4]
wide_predicted = wide_predicted[, predicted_5 := wide_pred_5]
wide_predicted = wide_predicted[, predicted_6 := wide_pred_6]
wide_predicted = wide_predicted[, predicted_7 := wide_pred_7]
wide_predicted = wide_predicted[, predicted_8 := wide_pred_8]
wide_predicted = wide_predicted[, predicted_9 := wide_pred_9]
wide_predicted = wide_predicted[, predicted_10 := wide_pred_10]
wide_predicted = wide_predicted[, predicted_11 := wide_pred_11]
wide_predicted = wide_predicted[, predicted_12 := wide_pred_12]
wide_predicted = wide_predicted[, predicted_13 := wide_pred_13]
wide_predicted = wide_predicted[, predicted_14 := wide_pred_14]
wide_predicted = wide_predicted[, predicted_15 := wide_pred_15]
wide_predicted = wide_predicted[, predicted_16 := wide_pred_16]
wide_predicted = wide_predicted[, predicted_17 := wide_pred_17]
wide_predicted = wide_predicted[, predicted_18 := wide_pred_18]
wide_predicted = wide_predicted[, predicted_19 := wide_pred_19]
wide_predicted = wide_predicted[, predicted_20 := wide_pred_20]
wide_predicted = wide_predicted[, predicted_21 := wide_pred_21]
wide_predicted = wide_predicted[, predicted_22 := wide_pred_22]
wide_predicted = wide_predicted[, predicted_23 := wide_pred_23]
We calculated the APE and MAPE for predictions.
# calculate APE for predicted consumption
wide_predicted = wide_predicted[, APE_H0:=(abs(wide_predicted$actual_hour_0-wide_predicted$predicted_0)/abs(wide_predicted$actual_hour_0))*100]
wide_predicted = wide_predicted[, APE_H1:=(abs(wide_predicted$actual_hour_1-wide_predicted$predicted_1)/abs(wide_predicted$actual_hour_1))*100]
wide_predicted = wide_predicted[, APE_H2:=(abs(wide_predicted$actual_hour_2-wide_predicted$predicted_2)/abs(wide_predicted$actual_hour_2))*100]
wide_predicted = wide_predicted[, APE_H3:=(abs(wide_predicted$actual_hour_3-wide_predicted$predicted_3)/abs(wide_predicted$actual_hour_3))*100]
wide_predicted = wide_predicted[, APE_H4:=(abs(wide_predicted$actual_hour_4-wide_predicted$predicted_4)/abs(wide_predicted$actual_hour_4))*100]
wide_predicted = wide_predicted[, APE_H5:=(abs(wide_predicted$actual_hour_5-wide_predicted$predicted_5)/abs(wide_predicted$actual_hour_5))*100]
wide_predicted = wide_predicted[, APE_H6:=(abs(wide_predicted$actual_hour_6-wide_predicted$predicted_6)/abs(wide_predicted$actual_hour_6))*100]
wide_predicted = wide_predicted[, APE_H7:=(abs(wide_predicted$actual_hour_7-wide_predicted$predicted_7)/abs(wide_predicted$actual_hour_7))*100]
wide_predicted = wide_predicted[, APE_H8:=(abs(wide_predicted$actual_hour_8-wide_predicted$predicted_8)/abs(wide_predicted$actual_hour_8))*100]
wide_predicted = wide_predicted[, APE_H9:=(abs(wide_predicted$actual_hour_9-wide_predicted$predicted_9)/abs(wide_predicted$actual_hour_9))*100]
wide_predicted = wide_predicted[, APE_H10:=(abs(wide_predicted$actual_hour_10-wide_predicted$predicted_10)/abs(wide_predicted$actual_hour_10))*100]
wide_predicted = wide_predicted[, APE_H11:=(abs(wide_predicted$actual_hour_11-wide_predicted$predicted_11)/abs(wide_predicted$actual_hour_11))*100]
wide_predicted = wide_predicted[, APE_H12:=(abs(wide_predicted$actual_hour_12-wide_predicted$predicted_12)/abs(wide_predicted$actual_hour_12))*100]
wide_predicted = wide_predicted[, APE_H13:=(abs(wide_predicted$actual_hour_13-wide_predicted$predicted_13)/abs(wide_predicted$actual_hour_13))*100]
wide_predicted = wide_predicted[, APE_H14:=(abs(wide_predicted$actual_hour_14-wide_predicted$predicted_14)/abs(wide_predicted$actual_hour_14))*100]
wide_predicted = wide_predicted[, APE_H15:=(abs(wide_predicted$actual_hour_15-wide_predicted$predicted_15)/abs(wide_predicted$actual_hour_15))*100]
wide_predicted = wide_predicted[, APE_H16:=(abs(wide_predicted$actual_hour_16-wide_predicted$predicted_16)/abs(wide_predicted$actual_hour_16))*100]
wide_predicted = wide_predicted[, APE_H17:=(abs(wide_predicted$actual_hour_17-wide_predicted$predicted_17)/abs(wide_predicted$actual_hour_17))*100]
wide_predicted = wide_predicted[, APE_H18:=(abs(wide_predicted$actual_hour_18-wide_predicted$predicted_18)/abs(wide_predicted$actual_hour_18))*100]
wide_predicted = wide_predicted[, APE_H19:=(abs(wide_predicted$actual_hour_19-wide_predicted$predicted_19)/abs(wide_predicted$actual_hour_19))*100]
wide_predicted = wide_predicted[, APE_H20:=(abs(wide_predicted$actual_hour_20-wide_predicted$predicted_20)/abs(wide_predicted$actual_hour_20))*100]
wide_predicted = wide_predicted[, APE_H21:=(abs(wide_predicted$actual_hour_21-wide_predicted$predicted_21)/abs(wide_predicted$actual_hour_21))*100]
wide_predicted = wide_predicted[, APE_H22:=(abs(wide_predicted$actual_hour_22-wide_predicted$predicted_22)/abs(wide_predicted$actual_hour_22))*100]
wide_predicted = wide_predicted[, APE_H23:=(abs(wide_predicted$actual_hour_23-wide_predicted$predicted_23)/abs(wide_predicted$actual_hour_23))*100]
# calculate MAPE for predicted consumption
wide_MAPE_0 = mean(wide_predicted$APE_H0)
wide_MAPE_1 = mean(wide_predicted$APE_H1)
wide_MAPE_2 = mean(wide_predicted$APE_H2)
wide_MAPE_3 = mean(wide_predicted$APE_H3)
wide_MAPE_4 = mean(wide_predicted$APE_H4)
wide_MAPE_5 = mean(wide_predicted$APE_H5)
wide_MAPE_6 = mean(wide_predicted$APE_H6)
wide_MAPE_7 = mean(wide_predicted$APE_H7)
wide_MAPE_8 = mean(wide_predicted$APE_H8)
wide_MAPE_9 = mean(wide_predicted$APE_H9)
wide_MAPE_10 = mean(wide_predicted$APE_H10)
wide_MAPE_11 = mean(wide_predicted$APE_H11)
wide_MAPE_12 = mean(wide_predicted$APE_H12)
wide_MAPE_13 = mean(wide_predicted$APE_H13)
wide_MAPE_14 = mean(wide_predicted$APE_H14)
wide_MAPE_15 = mean(wide_predicted$APE_H15)
wide_MAPE_16 = mean(wide_predicted$APE_H16)
wide_MAPE_17 = mean(wide_predicted$APE_H17)
wide_MAPE_18 = mean(wide_predicted$APE_H18)
wide_MAPE_19 = mean(wide_predicted$APE_H19)
wide_MAPE_20 = mean(wide_predicted$APE_H20)
wide_MAPE_21 = mean(wide_predicted$APE_H21)
wide_MAPE_22 = mean(wide_predicted$APE_H22)
wide_MAPE_23 = mean(wide_predicted$APE_H23)
We checked out the summary statistics to better understand the prediction error distributions.
# Display summary statistics
summary(wide_predicted$APE_H0)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03091 1.25424 3.71368 4.37286 6.53782 17.12464
quantile(wide_predicted$APE_H0, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 0.4862432 1.2542413 3.7136794 6.5378200 8.5477957
summary(wide_predicted$APE_H1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05774 1.46751 4.05013 4.42039 6.52903 17.29339
quantile(wide_predicted$APE_H1, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 0.5616984 1.4675070 4.0501246 6.5290336 8.3936829
summary(wide_predicted$APE_H2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01982 1.37021 4.52609 4.65492 7.36666 17.01919
quantile(wide_predicted$APE_H2, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 0.5973676 1.3702126 4.5260922 7.3666602 8.3543642
summary(wide_predicted$APE_H3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.749 11.638 15.091 15.564 20.824 33.057
quantile(wide_predicted$APE_H3, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 7.41536 11.63776 15.09053 20.82352 22.20529
summary(wide_predicted$APE_H4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.811 13.383 16.994 17.371 22.431 34.490
quantile(wide_predicted$APE_H4, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 8.980602 13.383169 16.993887 22.430710 24.322582
summary(wide_predicted$APE_H5)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.514 15.664 19.542 19.694 23.730 36.515
quantile(wide_predicted$APE_H5, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 12.51076 15.66442 19.54248 23.73049 27.08139
summary(wide_predicted$APE_H6)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.246 17.061 23.371 22.728 27.755 44.199
quantile(wide_predicted$APE_H6, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 10.41912 17.06122 23.37100 27.75451 33.04800
summary(wide_predicted$APE_H7)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.78 16.44 21.86 22.22 28.50 51.15
quantile(wide_predicted$APE_H7, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 5.449933 16.435842 21.859515 28.501244 38.793394
summary(wide_predicted$APE_H8)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.512 8.476 13.520 17.442 20.383 60.771
quantile(wide_predicted$APE_H8, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 5.328471 8.476098 13.519514 20.382856 36.988564
summary(wide_predicted$APE_H9)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7052 5.8566 11.7804 14.9110 17.3066 62.5129
quantile(wide_predicted$APE_H9, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 2.169617 5.856620 11.780439 17.306552 34.577273
summary(wide_predicted$APE_H10)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02245 3.57588 9.85948 12.83456 18.56747 58.27387
quantile(wide_predicted$APE_H10, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 1.787553 3.575880 9.859479 18.567472 30.180950
summary(wide_predicted$APE_H11)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2929 3.8709 7.2819 11.3361 15.4001 51.7869
quantile(wide_predicted$APE_H11, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 1.223263 3.870909 7.281891 15.400133 25.356553
summary(wide_predicted$APE_H12)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1122 2.9461 6.9760 10.0373 13.6592 47.6097
quantile(wide_predicted$APE_H12, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 1.576649 2.946093 6.975971 13.659158 21.724151
summary(wide_predicted$APE_H13)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1414 3.3861 6.1866 9.2992 12.8039 45.0147
quantile(wide_predicted$APE_H13, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 1.023881 3.386071 6.186594 12.803914 20.659264
summary(wide_predicted$APE_H14)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04451 3.69817 6.97402 9.41273 13.15296 44.42113
quantile(wide_predicted$APE_H14, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 0.9512929 3.6981740 6.9740182 13.1529627 19.4100127
summary(wide_predicted$APE_H15)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05971 3.76325 6.75082 8.93436 12.30857 40.05998
quantile(wide_predicted$APE_H15, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 1.443558 3.763253 6.750822 12.308567 18.467889
summary(wide_predicted$APE_H16)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.161 3.873 7.121 8.606 11.883 34.222
quantile(wide_predicted$APE_H16, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 2.231547 3.873087 7.120862 11.882563 17.557349
summary(wide_predicted$APE_H17)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2683 3.7194 6.8442 8.0131 10.5073 24.9968
quantile(wide_predicted$APE_H17, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 1.968434 3.719404 6.844190 10.507338 14.668786
summary(wide_predicted$APE_H18)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02617 3.67857 6.98089 7.49116 10.51716 19.47231
quantile(wide_predicted$APE_H18, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 1.781759 3.678568 6.980888 10.517157 14.178669
summary(wide_predicted$APE_H19)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1149 6.5742 8.3222 9.0132 12.5364 19.7619
quantile(wide_predicted$APE_H19, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 2.822575 6.574154 8.322249 12.536404 15.205228
summary(wide_predicted$APE_H20)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1998 6.4409 8.5501 9.2051 12.4935 18.5262
quantile(wide_predicted$APE_H20, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 4.505608 6.440950 8.550050 12.493457 14.458362
summary(wide_predicted$APE_H21)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02217 5.07573 8.12066 8.09262 11.33670 16.30168
quantile(wide_predicted$APE_H21, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 2.940776 5.075735 8.120663 11.336701 13.151602
summary(wide_predicted$APE_H22)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2275 3.7274 6.0373 6.4651 9.4742 14.6195
quantile(wide_predicted$APE_H22, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 1.558514 3.727411 6.037332 9.474173 10.966660
summary(wide_predicted$APE_H23)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05222 1.51538 3.28717 4.07788 6.30070 12.25404
quantile(wide_predicted$APE_H23, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
## 10% 25% 50% 75% 90%
## 0.2756153 1.5153759 3.2871658 6.3007031 9.0123930
We also plotted a boxplot to see the variability visually.
# Plot a boxplot for hourly APE
boxplot(wide_predicted$APE_H1, wide_predicted$APE_H2, wide_predicted$APE_H3, wide_predicted$APE_H4,wide_predicted$APE_H5,wide_predicted$APE_H6, wide_predicted$APE_H7, wide_predicted$APE_H8, wide_predicted$APE_H9, wide_predicted$APE_H10,wide_predicted$APE_H11, wide_predicted$APE_H12, wide_predicted$APE_H13, wide_predicted$APE_H14, wide_predicted$APE_H15, wide_predicted$APE_H16, wide_predicted$APE_H17, wide_predicted$APE_H18, wide_predicted$APE_H19, wide_predicted$APE_H20, wide_predicted$APE_H21, wide_predicted$APE_H22, wide_predicted$APE_H23, wide_predicted$APE_H0)
From the box plot, first thing we see is that consumption is higher in the morning. When we chek the deviations we see that night hours has smaller deviation and less number of outliers than that of during the day. We can also say that when mean error increase in prediction, max error values and number of outliers also increase.
Additional Note: We are aware of that there might be a better way of code but we kept our focus on the result rather than the scripting language.