# Class examples: principal component analysis, hierarchical clustering,
# k-means clustering, EM (Gaussian mixture) clustering, and a random-forest
# submission script for the kaggle exercise.
#
# Most top-level expressions are deliberately bare so they auto-print when
# run interactively.

library(MASS)          # mvrnorm()
library(mclust)        # Mclust(), used in the EM section
library(randomForest)  # randomForest(), used in the kaggle section

# Principal component analysis example ----

X <- mvrnorm(1000, c(3, 3), matrix(c(9, 4.5, 4.5, 4), nrow = 2))
plot(X)

prcomp.output <- prcomp(X)
prcomp.output

# The loadings (columns of $rotation) are orthonormal:
# dot product of distinct columns is 0, each column has unit length.
t(prcomp.output$rotation[, 1]) %*% prcomp.output$rotation[, 2]
sum(prcomp.output$rotation[, 1]^2)
sum(prcomp.output$rotation[, 2]^2)

# PCA via the SVD of the centered data matrix.
X.centered <- scale(X, center = TRUE, scale = FALSE)
svd.output <- svd(X.centered)
svd.output$v                                       # matches $rotation (up to sign)
svd.output$d / sqrt(max(1, nrow(X.centered) - 1))  # matches $sdev

# X.centered = U D V', so the SVD reproduces the data exactly.
X.centered[1:3, ]
(svd.output$u %*% diag(svd.output$d) %*% t(svd.output$v))[1:3, ]

# PCA via the eigendecomposition of the covariance matrix:
# eigenvalues match the component variances.
eigen(cov(X.centered))
prcomp.output$sdev^2

# Total variance is preserved and redistributed across the components.
var(X.centered[, 1]) + var(X.centered[, 2])
sum(prcomp.output$sdev^2)
prcomp.output$sdev^2 / sum(prcomp.output$sdev^2)   # proportion of variance

# Projection: using both principal components.
predictions1 <- predict(prcomp.output, X)
predictions2 <- X.centered %*% prcomp.output$rotation
predictions1[1:3, ]
predictions2[1:3, ]

reconstruction <- predictions2 %*% t(prcomp.output$rotation)
X.centered[1:3, ]
reconstruction[1:3, ]  # perfect reconstruction: all principal components

# Projection: using only one principal component.
predictions3 <- X.centered %*% prcomp.output$rotation[, 1]
reconstruction <- predictions3 %*% t(prcomp.output$rotation[, 1])
X.centered[1:3, ]
reconstruction[1:3, ]  # imperfect reconstruction: only one principal component

# Simple hierarchical clustering example ----

X <- matrix(c(-0.50, -1.00,
               0.00, -0.80,
              -1.50, -0.40,
              -1.40, -1.50,
               1.20, -0.25,
              -1.00, -1.10,
               1.40,  0.00,
               0.50, -0.10,
               0.00,  0.90),
            nrow = 9, byrow = TRUE)
model.hclust <- hclust(dist(X))

dist(X)                                  # euclidean distance
cophenetic(hclust(dist(X), "complete"))  # based on max distance

# Cophenetic correlation: how faithfully each dendrogram preserves
# the original pairwise distances.
cor(dist(X), cophenetic(hclust(dist(X), "complete")))
cor(dist(X), cophenetic(hclust(dist(X), "average")))

# K-means clustering example ----

# Copy and paste a dozen times: checking for goofy coloring of the clusters.
X <- rbind(
  mvrnorm(1000, c( 4,  4), matrix(c(1,  0.9,  0.9, 2), nrow = 2, byrow = TRUE)),
  mvrnorm(1000, c( 4, -4), matrix(c(1, -0.9, -0.9, 2), nrow = 2, byrow = TRUE)),
  mvrnorm(1000, c(-4, -4), matrix(c(1,  0.9,  0.9, 2), nrow = 2, byrow = TRUE)),
  mvrnorm(1000, c(-4,  4), matrix(c(1, -0.9, -0.9, 2), nrow = 2, byrow = TRUE))
)
plot(X)
model <- kmeans(X, 4)
plot(X, col = model$cluster)

# Hierarchical clustering to initialize the k-means model: cut the average-
# linkage tree into 4 groups and use the group means as starting centers.
cluster <- cutree(hclust(dist(X), method = "average"), k = 4)
centers <- cbind(
  tapply(X[, 1], cluster, mean),
  tapply(X[, 2], cluster, mean)
)
model <- kmeans(X, centers)
plot(X, col = model$cluster)

# Decomposition of the total sum of squares for a 2-cluster fit.
X <- rbind(
  mvrnorm(1000, c( 3,  3), matrix(c(1, 0, 0, 1), nrow = 2, byrow = TRUE)),
  mvrnorm(1000, c(-3, -3), matrix(c(1, 0, 0, 1), nrow = 2, byrow = TRUE))
)
plot(X)
model <- kmeans(X, 2)
names(model)

# totss: squared distances from every point to the grand mean.
model$totss
grand.mean <- colMeans(X)
sum((X[, 1] - grand.mean[1])^2) + sum((X[, 2] - grand.mean[2])^2)

# betweenss: cluster sizes times squared distances from each center
# to the grand mean.
model$betweenss
sum(model$cluster == 1) * (((model$centers[1, 1] - grand.mean[1])^2) +
                           ((model$centers[1, 2] - grand.mean[2])^2)) +
  sum(model$cluster == 2) * (((model$centers[2, 1] - grand.mean[1])^2) +
                             ((model$centers[2, 2] - grand.mean[2])^2))

# withinss / tot.withinss: squared distances from each point to its
# assigned cluster center.
model$withinss
model$tot.withinss
sum((X[model$cluster == 1, 1] - model$centers[1, 1])^2) +
  sum((X[model$cluster == 1, 2] - model$centers[1, 2])^2) +
  sum((X[model$cluster == 2, 1] - model$centers[2, 1])^2) +
  sum((X[model$cluster == 2, 2] - model$centers[2, 2])^2)

# The decomposition: totss = betweenss + tot.withinss.
model$totss
model$betweenss + model$tot.withinss

# Clustering using Expectation Maximization (EM) ----
# with multivariate Gaussian distribution parameters.

set.seed(2^17 - 1)
X1 <- mvrnorm(1000, c(-6,  6), matrix(c(4,  4.5,  4.5, 9), nrow = 2, byrow = TRUE))
X2 <- mvrnorm(1000, c( 6,  6), matrix(c(4, -4.5, -4.5, 9), nrow = 2, byrow = TRUE))
X3 <- mvrnorm(1000, c( 0, -6), matrix(c(2, 0, 0, 2), nrow = 2, byrow = TRUE))
X <- rbind(X1, X2, X3)
plot(X)

model <- Mclust(X)
names(model)
names(model$parameters)
names(model$parameters$variance)
model$parameters$pro             # mixing proportions
model$parameters$mean            # component means (one column per component)
model$parameters$variance$sigma  # component covariance matrices

# Example code for the kaggle exercise ----

set.seed(2^17 - 1)
# NOTE(review): hard-coded, machine-specific working directory; adjust or
# replace with project-relative paths as needed.
setwd("C:/Data/Insurance")

# Column types for the 85 predictors: all numeric except columns 1 and 5,
# which are categorical.
colClasses <- rep("numeric", 85)
colClasses[1] <- "factor"
colClasses[5] <- "factor"

# BUG FIX: colClasses was constructed but never passed to read.table(),
# so columns 1 and 5 were not read as factors; pass it explicitly.
trn_X <- read.table("trn_X.tsv", colClasses = colClasses)
trn_y <- factor(scan("trn_y.txt"), levels = c(0, 1), labels = c("no", "yes"))
tst_X <- read.table("tst_X.tsv", colClasses = colClasses)

model <- randomForest(trn_X, trn_y)
# Column 2 of the probability matrix is P(class == "yes").
predictions <- predict(model, tst_X, type = "prob")[, 2]

output <- data.frame(Id = seq_along(predictions), Predicted = predictions)
write.csv(output, "predictions.csv", quote = FALSE, row.names = FALSE)