#==============================================================================#
#==========                     Jacek Wallusch                       ==========#
#==========         MBA: Introduction to Pricing Analytics           ==========#
#==========   OLS Regression - Estimation, Significance, and Fit     ==========#
#==============================================================================#

# Data: Used car prices
# Data courtesy: Allegro, Audi Q5 (filters: accident-free, Diesel engine,
#   drivetrain full-time AWD, registered in Poland, undamaged, available for
#   auction in October 2016)
# Libraries: normwhn.test, lmtest, car, plotly

#------------------------------------------------------------------------------
# IMPORT THE DATA ----
# Semicolon-separated .csv file with a header row
# (change the location of your file!):
audiq5 <- read.csv(
  "C:/Users/SESA387551/Documents/Lehre/Wintersemester-2016/audiq5.csv",
  header = TRUE,
  sep = ";"
)

#------------------------------------------------------------------------------
# INITIATE THE PACKAGES ----
library(normwhn.test) # normality.test1()
library(lmtest)       # bptest()
library(car)          # vif()
library(plotly)       # plot_ly(), add_trace(), layout() and the %>% pipe

#------------------------------------------------------------------------------
# WRITE THE VARIABLES, CREATE TRANSFORMED REGRESSORS ----
# Columns are picked by position, so the column order of the file matters.
q5_year <- audiq5[, 1] # year (subtracted from 2016 below to obtain the age)
q5_encc <- audiq5[, 2] # engine capacity
q5_mila <- audiq5[, 3] # mileage (plotted as "Mileage in kilometers")
q5_pric <- audiq5[, 4] # price
q5_ccde <- audiq5[, 5] # last column; it is appended to the engine capacity
                       # NOTE(review): presumably its decimal part - confirm

# The "_du" variables are rescaled regressors, not 0/1 dummies:
q5_y_du <- 2016 - q5_year  # age of the car in years, relative to 2016
q5_m_du <- q5_mila / 10000 # mileage in units of 10,000

# Merge the engine capacity and the last column into one label, which is used
# as the colour-grouping variable in the scatter plot
q5_ccdu <- paste0(q5_encc, q5_ccde)

#------------------------------------------------------------------------------
# FIRST STEP: Data Visualisation ----
plot_ly(
  x = q5_mila,
  y = q5_pric,
  # colour grouping: the colour of the bubble depends on q5_ccdu
  color = q5_ccdu,
  colors = c("dark green", "dark red"),
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    # the \n escape breaks the line
    title = "Price, Mileage, and Engine Displacement (colour)\nof Audi Q5 Offered at Allegro",
    margin = list(l = 150, r = 150, b = 150, t = 150, pad = 1),
    xaxis = list(title = "Mileage in kilometers"),
    yaxis = list(title = "Price")
  )

#------------------------------------------------------------------------------
# OLS Estimations ----
# First model: price on year and raw mileage
q5_ols_01 <- lm(q5_pric ~ q5_year + q5_mila)
summary(q5_ols_01)

# Re-estimate the model with age instead of year
q5_ols_02 <- lm(q5_pric ~ q5_y_du + q5_mila)
summary(q5_ols_02)

# Re-estimate the model again with age and rescaled mileage
q5_ols_03 <- lm(q5_pric ~ q5_y_du + q5_m_du)
summary(q5_ols_03)

#------------------------------------------------------------------------------
# OLS Estimations with No Intercept ----
# "- 1" removes the intercept from the model formula
q5_ols_04 <- lm(q5_pric ~ q5_y_du + q5_mila - 1)
summary(q5_ols_04)

#------------------------------------------------------------------------------
# OLS Estimations: GOODNESS-OF-FIT ----
# Logarithm of likelihood
logLik(q5_ols_03)

# Akaike Information Criterion
extractAIC(q5_ols_03)

# Schwarz Bayesian Information Criterion: same call with the penalty per
# parameter set to log(number of observations)
T_res_q5 <- NROW(residuals(q5_ols_03))
extractAIC(q5_ols_03, k = log(T_res_q5))

#------------------------------------------------------------------------------
# OLS Estimations: DIAGNOSTICS ----
# All diagnostics below refer to the same model, q5_ols_03.

# Residuals (despite the "04" in the name, these come from q5_ols_03; the name
# is kept so that code relying on it keeps working):
reg_04_resids <- q5_ols_03$residuals

# Histogram and normal distribution.
# The histogram is computed once and not drawn (plot = FALSE); only its bin
# mid-points and densities are needed for the plotly chart.
resid_hist <- hist(reg_04_resids, plot = FALSE)

norm_resids_graph <- plot_ly(
  x = resid_hist$mids,
  y = resid_hist$density,
  type = "bar",
  name = "Empirical Density"
) %>%
  layout(
    title = "Empirical Distribution of Residuals\nvs. Normal Distribution",
    margin = list(l = 150, r = 150, b = 150, t = 150, pad = 1)
  )

# Overlay the normal density with the residuals' mean and standard deviation,
# evaluated at the bin mid-points
norm_resids_graph <- add_trace(
  norm_resids_graph,
  x = resid_hist$mids,
  y = dnorm(
    resid_hist$mids,
    mean = mean(reg_04_resids),
    sd = sd(reg_04_resids)
  ),
  type = "scatter",
  mode = "lines",
  name = "Normal Distribution"
)
norm_resids_graph

# Testing for normality of the (residuals) distribution - insert the residuals
# as a matrix
# Univariate Doornik-Hansen Test
normality.test1(as.matrix(reg_04_resids))

# Testing for homoskedasticity
# Breusch-Pagan test for non-studentised residuals
bptest(q5_ols_03, studentize = FALSE)

# Checking for multicollinearity
# Variance Inflation Factor (VIF); computed for the model with an intercept,
# because car::vif() warns that VIFs may not be sensible without one
vif(q5_ols_03)