# AIC versus BIC Credit = read.csv("http://www-bcf.usc.edu/~gareth/ISL/Credit.csv", row.names = 1) model = lm(Balance ~ Cards + Income + Student + Limit, data = Credit) AIC(model) BIC(model) residuals = model$residuals n = length(residuals) weights = rep.int(1, n) logLik(model) log.likelihood = 0.5 * (sum(log(weights)) - n * (log(2 * pi) + 1 - log(n) + log(sum(weights * (residuals^2))))) log.likelihood df = model$rank + 1 # +1 for dispersion parameter df -2 * log.likelihood + 2 * df -2 * log.likelihood + log(n) * df # Principal Components Analysis (PCA): # generating new predictors for regression library(MASS) X = mvrnorm(1000, c(3, 3), matrix(c(9, 4.5, 4.5, 4), nrow=2)) plot(X) prcomp.output = prcomp(X) prcomp.output X.centered = scale(X, center = T, scale = F) svd.output = svd(X.centered) svd.output$v svd.output$d / sqrt(max(1, nrow(X.centered) - 1)) X.centered[1:3,] (svd.output$u %*% diag(svd.output$d) %*% t(svd.output$v))[1:3,] t(prcomp.output$rotation[,1]) %*% prcomp.output$rotation[,2] sum(prcomp.output$rotation[,1]^2) # Partial Least Squares (PLS): # generating new predictors for regression Credit = read.csv("http://www-bcf.usc.edu/~gareth/ISL/Credit.csv", row.names = 1) X = Credit[,1:6] y = Credit[,11] X = scale(X, center = T, scale = T) y = scale(y, center = T, scale = T) library(chemometrics) m = 3 pls1_nipals(X, y, m)$T[1:3,] New.Predictors = NULL for (i in 1:m) { # where m is the number of new predictors weights = t(X) %*% y weights = weights / sqrt((t(weights) %*% weights)[1,1]) new.predictor = X %*% weights # lm(X[,j] ~ new.predictor)$coefficients[2] p = (t(X) %*% new.predictor) / (t(new.predictor) %*% new.predictor)[1,1] # lm(y ~ new.predictor)$coefficients[2] coefficient = (t(new.predictor) %*% y) / (t(new.predictor) %*% new.predictor)[1,1] X = X - new.predictor %*% t(p) y = y - new.predictor %*% coefficient New.Predictors = cbind(New.Predictors, new.predictor) } New.Predictors[1:3,] ################################################################################ # linear regression example for estimating twitter buzz setwd("C:/Data/twitter") set.seed(2^17-1) start.time = Sys.time() trn_X = read.csv("trn_X.csv", header = F) trn_y = scan("trn_y.txt") tst_X = read.csv("tst_X.csv", header = F) trn = data.frame(y = trn_y, X = trn_X) tst = data.frame(X = tst_X) model = glm(y ~ ., data = trn, family = gaussian) predictions = predict(model, newdata = tst, typ = "response") output = data.frame(Id = 1:length(predictions), Prediction = predictions) write.csv(output, "predictions.csv", quote=F, row.names = F) Sys.time() - start.time summary(model)