Prepared with Ceren Demirkol, Okan Güven, Sevgican Varol
require(data.table)
library(glmnet)
set.seed(123)
#Getting data
consumption=fread("C:/Users/ceren.orhan/Desktop/ETM 58D/HW2-3/GercekZamanliTuketim-01012016-19052020.csv")
setnames(consumption,names(consumption)[3],'value')
consumption[,date:=as.Date(Tarih,'%d.%m.%Y')]
consumption[,hour:=as.numeric(substr(Saat,1,2))]
consumption=consumption[,list(date,hour,value)]
consumption[,value:=gsub(".", "",value, fixed = TRUE)]
consumption[,value:=as.numeric(gsub(",", ".",value, fixed = TRUE))]
consumption[,lag_168:=shift(value,168)]
consumption[,lag_48:=shift(value,48)]
full_consumption=consumption[complete.cases(consumption)]
head(full_consumption)
# Filter consumption data in long format
long_tr = full_consumption[date < '2020-03-01']
long_te = full_consumption[date >= '2020-03-01']
# Create training and test consumption data in wide format with lag_168
wide_pen_tr = merge(dcast(long_tr, date~paste0('lag_48_hour_', hour), value.var='lag_48'), dcast(long_tr, date~paste0('lag_168_hour_', hour), value.var='lag_168'), by = 'date')
wide_pen_tr_ac = dcast(long_tr, date~paste0('actual_', hour), value.var='value')
wide_pen_te = merge(dcast(long_te, date~paste0('lag_48_hour_', hour), value.var='lag_48'), dcast(long_te, date~paste0('lag_168_hour_', hour), value.var='lag_168'), by = 'date')
wide_pen_te_ac = dcast(long_te, date~paste0('actual_', hour), value.var='value')
# Convert data frame to matrix
wide_tr_mat = data.matrix(wide_pen_tr)
wide_tr_ac_mat = data.matrix(wide_pen_tr_ac)
wide_te_mat = data.matrix(wide_pen_te)
# Find optimal lambda with cv
wide_fit_0 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 2], alpha = 1)
wide_fit_1 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 3], alpha = 1)
wide_fit_2 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 14], alpha = 1)
wide_fit_3 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 19], alpha = 1)
wide_fit_4 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 20], alpha = 1)
wide_fit_5 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 21], alpha = 1)
wide_fit_6 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 22], alpha = 1)
wide_fit_7 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 23], alpha = 1)
wide_fit_8 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 24], alpha = 1)
wide_fit_9 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 25], alpha = 1)
wide_fit_10 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 4], alpha = 1)
wide_fit_11 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 5], alpha = 1)
wide_fit_12 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 6], alpha = 1)
wide_fit_13 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 7], alpha = 1)
wide_fit_14 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 8], alpha = 1)
wide_fit_15 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 9], alpha = 1)
wide_fit_16 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 10], alpha = 1)
wide_fit_17 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 11], alpha = 1)
wide_fit_18 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 12], alpha = 1)
wide_fit_19 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 13], alpha = 1)
wide_fit_20 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 15], alpha = 1)
wide_fit_21 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 16], alpha = 1)
wide_fit_22 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 17], alpha = 1)
wide_fit_23 = cv.glmnet(wide_tr_mat, wide_tr_ac_mat[, 18], alpha = 1)
# Sample lamda plot for Hour 0
plot(wide_fit_0)
wide_fit_0$lambda.min
wide_fit_0$lambda.1se
# Fit lasso models
lasso_model_0 = glmnet(wide_tr_mat, wide_tr_ac_mat[,2], alpha = 1, family = "gaussian", lambda = wide_fit_0$lambda.1se)
lasso_model_1 = glmnet(wide_tr_mat, wide_tr_ac_mat[,3], alpha = 1, family = "gaussian", lambda = wide_fit_1$lambda.1se)
lasso_model_2 = glmnet(wide_tr_mat, wide_tr_ac_mat[,14], alpha = 1, family = "gaussian", lambda = wide_fit_2$lambda.1se)
lasso_model_3 = glmnet(wide_tr_mat, wide_tr_ac_mat[,19], alpha = 1, family = "gaussian", lambda = wide_fit_3$lambda.1se)
lasso_model_4 = glmnet(wide_tr_mat, wide_tr_ac_mat[,20], alpha = 1, family = "gaussian", lambda = wide_fit_4$lambda.1se)
lasso_model_5 = glmnet(wide_tr_mat, wide_tr_ac_mat[,21], alpha = 1, family = "gaussian", lambda = wide_fit_5$lambda.1se)
lasso_model_6 = glmnet(wide_tr_mat, wide_tr_ac_mat[,22], alpha = 1, family = "gaussian", lambda = wide_fit_6$lambda.1se)
lasso_model_7 = glmnet(wide_tr_mat, wide_tr_ac_mat[,23], alpha = 1, family = "gaussian", lambda = wide_fit_7$lambda.1se)
lasso_model_8 = glmnet(wide_tr_mat, wide_tr_ac_mat[,24], alpha = 1, family = "gaussian", lambda = wide_fit_8$lambda.1se)
lasso_model_9 = glmnet(wide_tr_mat, wide_tr_ac_mat[,25], alpha = 1, family = "gaussian", lambda = wide_fit_9$lambda.1se)
lasso_model_10 = glmnet(wide_tr_mat, wide_tr_ac_mat[,4], alpha = 1, family = "gaussian", lambda = wide_fit_10$lambda.1se)
lasso_model_11 = glmnet(wide_tr_mat, wide_tr_ac_mat[,5], alpha = 1, family = "gaussian", lambda = wide_fit_11$lambda.1se)
lasso_model_12 = glmnet(wide_tr_mat, wide_tr_ac_mat[,6], alpha = 1, family = "gaussian", lambda = wide_fit_12$lambda.1se)
lasso_model_13 = glmnet(wide_tr_mat, wide_tr_ac_mat[,7], alpha = 1, family = "gaussian", lambda = wide_fit_13$lambda.1se)
lasso_model_14 = glmnet(wide_tr_mat, wide_tr_ac_mat[,8], alpha = 1, family = "gaussian", lambda = wide_fit_14$lambda.1se)
lasso_model_15 = glmnet(wide_tr_mat, wide_tr_ac_mat[,9], alpha = 1, family = "gaussian", lambda = wide_fit_15$lambda.1se)
lasso_model_16 = glmnet(wide_tr_mat, wide_tr_ac_mat[,10], alpha = 1, family = "gaussian", lambda = wide_fit_16$lambda.1se)
lasso_model_17 = glmnet(wide_tr_mat, wide_tr_ac_mat[,11], alpha = 1, family = "gaussian", lambda = wide_fit_17$lambda.1se)
lasso_model_18 = glmnet(wide_tr_mat, wide_tr_ac_mat[,12], alpha = 1, family = "gaussian", lambda = wide_fit_18$lambda.1se)
lasso_model_19 = glmnet(wide_tr_mat, wide_tr_ac_mat[,13], alpha = 1, family = "gaussian", lambda = wide_fit_19$lambda.1se)
lasso_model_20 = glmnet(wide_tr_mat, wide_tr_ac_mat[,15], alpha = 1, family = "gaussian", lambda = wide_fit_20$lambda.1se)
lasso_model_21 = glmnet(wide_tr_mat, wide_tr_ac_mat[,16], alpha = 1, family = "gaussian", lambda = wide_fit_21$lambda.1se)
lasso_model_22 = glmnet(wide_tr_mat, wide_tr_ac_mat[,17], alpha = 1, family = "gaussian", lambda = wide_fit_22$lambda.1se)
lasso_model_23 = glmnet(wide_tr_mat, wide_tr_ac_mat[,18], alpha = 1, family = "gaussian", lambda = wide_fit_23$lambda.1se)
# Make predictions with lasso models and create wide format table with lasso predicted consumption
lasso_predicted = wide_pen_te_ac
lasso_predicted = lasso_predicted[, lasso_predicted_0 := predict.glmnet(lasso_model_0, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_1 := predict.glmnet(lasso_model_1, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_2 := predict.glmnet(lasso_model_2, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_3 := predict.glmnet(lasso_model_3, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_4 := predict.glmnet(lasso_model_4, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_5 := predict.glmnet(lasso_model_5, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_6 := predict.glmnet(lasso_model_6, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_7 := predict.glmnet(lasso_model_7, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_8 := predict.glmnet(lasso_model_8, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_9 := predict.glmnet(lasso_model_9, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_10 := predict.glmnet(lasso_model_10, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_11 := predict.glmnet(lasso_model_11, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_12 := predict.glmnet(lasso_model_12, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_13 := predict.glmnet(lasso_model_13, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_14 := predict.glmnet(lasso_model_14, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_15 := predict.glmnet(lasso_model_15, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_16 := predict.glmnet(lasso_model_16, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_17 := predict.glmnet(lasso_model_17, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_18 := predict.glmnet(lasso_model_18, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_19 := predict.glmnet(lasso_model_19, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_20 := predict.glmnet(lasso_model_20, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_21 := predict.glmnet(lasso_model_21, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_22 := predict.glmnet(lasso_model_22, wide_te_mat)]
lasso_predicted = lasso_predicted[, lasso_predicted_23 := predict.glmnet(lasso_model_23, wide_te_mat)]
# calculate APE for lasso predicted consumption
lasso_predicted = lasso_predicted[, APE_L0:=(abs(lasso_predicted$actual_0-lasso_predicted$lasso_predicted_0)/abs(lasso_predicted$actual_0))*100]
lasso_predicted = lasso_predicted[, APE_L1:=(abs(lasso_predicted$actual_1-lasso_predicted$lasso_predicted_1)/abs(lasso_predicted$actual_1))*100]
lasso_predicted = lasso_predicted[, APE_L2:=(abs(lasso_predicted$actual_2-lasso_predicted$lasso_predicted_2)/abs(lasso_predicted$actual_2))*100]
lasso_predicted = lasso_predicted[, APE_L3:=(abs(lasso_predicted$actual_3-lasso_predicted$lasso_predicted_3)/abs(lasso_predicted$actual_3))*100]
lasso_predicted = lasso_predicted[, APE_L4:=(abs(lasso_predicted$actual_4-lasso_predicted$lasso_predicted_4)/abs(lasso_predicted$actual_4))*100]
lasso_predicted = lasso_predicted[, APE_L5:=(abs(lasso_predicted$actual_5-lasso_predicted$lasso_predicted_5)/abs(lasso_predicted$actual_5))*100]
lasso_predicted = lasso_predicted[, APE_L6:=(abs(lasso_predicted$actual_6-lasso_predicted$lasso_predicted_6)/abs(lasso_predicted$actual_6))*100]
lasso_predicted = lasso_predicted[, APE_L7:=(abs(lasso_predicted$actual_7-lasso_predicted$lasso_predicted_7)/abs(lasso_predicted$actual_7))*100]
lasso_predicted = lasso_predicted[, APE_L8:=(abs(lasso_predicted$actual_8-lasso_predicted$lasso_predicted_8)/abs(lasso_predicted$actual_8))*100]
lasso_predicted = lasso_predicted[, APE_L9:=(abs(lasso_predicted$actual_9-lasso_predicted$lasso_predicted_9)/abs(lasso_predicted$actual_9))*100]
lasso_predicted = lasso_predicted[, APE_L10:=(abs(lasso_predicted$actual_10-lasso_predicted$lasso_predicted_10)/abs(lasso_predicted$actual_10))*100]
lasso_predicted = lasso_predicted[, APE_L11:=(abs(lasso_predicted$actual_11-lasso_predicted$lasso_predicted_11)/abs(lasso_predicted$actual_11))*100]
lasso_predicted = lasso_predicted[, APE_L12:=(abs(lasso_predicted$actual_12-lasso_predicted$lasso_predicted_12)/abs(lasso_predicted$actual_12))*100]
lasso_predicted = lasso_predicted[, APE_L13:=(abs(lasso_predicted$actual_13-lasso_predicted$lasso_predicted_13)/abs(lasso_predicted$actual_13))*100]
lasso_predicted = lasso_predicted[, APE_L14:=(abs(lasso_predicted$actual_14-lasso_predicted$lasso_predicted_14)/abs(lasso_predicted$actual_14))*100]
lasso_predicted = lasso_predicted[, APE_L15:=(abs(lasso_predicted$actual_15-lasso_predicted$lasso_predicted_15)/abs(lasso_predicted$actual_15))*100]
lasso_predicted = lasso_predicted[, APE_L16:=(abs(lasso_predicted$actual_16-lasso_predicted$lasso_predicted_16)/abs(lasso_predicted$actual_16))*100]
lasso_predicted = lasso_predicted[, APE_L17:=(abs(lasso_predicted$actual_17-lasso_predicted$lasso_predicted_17)/abs(lasso_predicted$actual_17))*100]
lasso_predicted = lasso_predicted[, APE_L18:=(abs(lasso_predicted$actual_18-lasso_predicted$lasso_predicted_18)/abs(lasso_predicted$actual_18))*100]
lasso_predicted = lasso_predicted[, APE_L19:=(abs(lasso_predicted$actual_19-lasso_predicted$lasso_predicted_19)/abs(lasso_predicted$actual_19))*100]
lasso_predicted = lasso_predicted[, APE_L20:=(abs(lasso_predicted$actual_20-lasso_predicted$lasso_predicted_20)/abs(lasso_predicted$actual_20))*100]
lasso_predicted = lasso_predicted[, APE_L21:=(abs(lasso_predicted$actual_21-lasso_predicted$lasso_predicted_21)/abs(lasso_predicted$actual_21))*100]
lasso_predicted = lasso_predicted[, APE_L22:=(abs(lasso_predicted$actual_22-lasso_predicted$lasso_predicted_22)/abs(lasso_predicted$actual_22))*100]
lasso_predicted = lasso_predicted[, APE_L23:=(abs(lasso_predicted$actual_23-lasso_predicted$lasso_predicted_23)/abs(lasso_predicted$actual_23))*100]
# calculate MAPE for lasso predicted consumption
lasso_MAPE_0 = mean(lasso_predicted$APE_L0)
lasso_MAPE_1 = mean(lasso_predicted$APE_L1)
lasso_MAPE_2 = mean(lasso_predicted$APE_L2)
lasso_MAPE_3 = mean(lasso_predicted$APE_L3)
lasso_MAPE_4 = mean(lasso_predicted$APE_L4)
lasso_MAPE_5 = mean(lasso_predicted$APE_L5)
lasso_MAPE_6 = mean(lasso_predicted$APE_L6)
lasso_MAPE_7 = mean(lasso_predicted$APE_L7)
lasso_MAPE_8 = mean(lasso_predicted$APE_L8)
lasso_MAPE_9 = mean(lasso_predicted$APE_L9)
lasso_MAPE_10 = mean(lasso_predicted$APE_L10)
lasso_MAPE_11 = mean(lasso_predicted$APE_L11)
lasso_MAPE_12 = mean(lasso_predicted$APE_L12)
lasso_MAPE_13 = mean(lasso_predicted$APE_L13)
lasso_MAPE_14 = mean(lasso_predicted$APE_L14)
lasso_MAPE_15 = mean(lasso_predicted$APE_L15)
lasso_MAPE_16 = mean(lasso_predicted$APE_L16)
lasso_MAPE_17 = mean(lasso_predicted$APE_L17)
lasso_MAPE_18 = mean(lasso_predicted$APE_L18)
lasso_MAPE_19 = mean(lasso_predicted$APE_L19)
lasso_MAPE_20 = mean(lasso_predicted$APE_L20)
lasso_MAPE_21 = mean(lasso_predicted$APE_L21)
lasso_MAPE_22 = mean(lasso_predicted$APE_L22)
lasso_MAPE_23 = mean(lasso_predicted$APE_L23)
summary(lasso_predicted[,50:73])
# Display summary statistics
summary(lasso_predicted[,50:73]) # for APE values
quantile_hour0=quantile(lasso_predicted$APE_L0, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour1=quantile(lasso_predicted$APE_L1, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour2=quantile(lasso_predicted$APE_L2, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour3=quantile(lasso_predicted$APE_L3, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour4=quantile(lasso_predicted$APE_L4, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour5=quantile(lasso_predicted$APE_L5, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour6=quantile(lasso_predicted$APE_L6, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour7=quantile(lasso_predicted$APE_L7, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour8=quantile(lasso_predicted$APE_L8, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour9=quantile(lasso_predicted$APE_L9, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour10=quantile(lasso_predicted$APE_L10, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour11=quantile(lasso_predicted$APE_L11, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour12=quantile(lasso_predicted$APE_L12, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour13=quantile(lasso_predicted$APE_L13, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour14=quantile(lasso_predicted$APE_L14, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour15=quantile(lasso_predicted$APE_L15, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour16=quantile(lasso_predicted$APE_L16, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour17=quantile(lasso_predicted$APE_L17, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour18=quantile(lasso_predicted$APE_L18, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour19=quantile(lasso_predicted$APE_L19, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour20=quantile(lasso_predicted$APE_L20, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour21=quantile(lasso_predicted$APE_L21, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour22=quantile(lasso_predicted$APE_L22, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
quantile_hour23=quantile(lasso_predicted$APE_L23, probs = c(0.1, 0.25, 0.5, 0.75, 0.9))
q_all=cbind(quantile_hour0,
quantile_hour1,
quantile_hour2,
quantile_hour3,
quantile_hour4,
quantile_hour5,
quantile_hour6,
quantile_hour7,
quantile_hour8,
quantile_hour9,
quantile_hour10,
quantile_hour11,
quantile_hour12,
quantile_hour13,
quantile_hour14,
quantile_hour15,
quantile_hour16,
quantile_hour17,
quantile_hour18,
quantile_hour19,
quantile_hour20,
quantile_hour21,
quantile_hour22,
quantile_hour23)
q_all
# Plot a boxplot for penalized APE
boxplot(lasso_predicted[,50:73])
With the penalized regression models we obtain more consistent boxplots. The error rates for the working hours are slightly more than the non-working hours. This is probably due to the comsumption difference for the working hours during the weekdays and weekends.
Side note: We are aware of that there might be a better way of writing some parts but we keep our focus on the result rather than the scripting language.