# creating some data
set.seed(2^17 - 1)
library(MASS)
X_trn = rbind(mvrnorm(1000, c(3, 3), diag(2)), mvrnorm(1000, c(-3, -3), diag(2)))
y_trn = rep(c(1, -1), c(1000, 1000))
plot(X_trn, col = (y_trn + 1) / 2 + 1)
X_tst = rbind(mvrnorm(1000, c(3, 3), diag(2)), mvrnorm(1000, c(-3, -3), diag(2)))
y_tst = rep(c(1, -1), c(1000, 1000))

# scaling (preprocessing) the data: important for classifiers that measure similarity between observations
X_trn_scaled = scale(X_trn, center = TRUE, scale = TRUE)
X_tst_scaled = scale(X_tst, center = attr(X_trn_scaled, "scaled:center"),
                     scale = attr(X_trn_scaled, "scaled:scale"))

# interior point optimization (ipop) for solving the SVM dual as a quadratic programming problem
library(kernlab)
C = 0.25
n = nrow(X_trn_scaled)
c = matrix(rep(-1, n), ncol = 1)
H = diag(y_trn) %*% X_trn_scaled %*% t(X_trn_scaled) %*% diag(y_trn)
A = matrix(y_trn, nrow = 1)
b = 0
l = matrix(rep(0, n), ncol = 1)
u = matrix(rep(C, n), ncol = 1)
r = 0
solution = ipop(c, H, A, b, l, u, r)
coefficients = primal(solution)
which(coefficients > 0.000001)  # indices of the support vectors
round(coefficients[coefficients > 0.000001], 4) * y_trn[coefficients > 0.000001]  # signed alphas

# use ksvm to print the value of the objective function
model = ksvm(X_trn_scaled, factor(y_trn), kernel = "vanilladot", C = C, scaled = FALSE)
model

# deriving the value of the objective function from the ipop solution
coefficients = primal(solution)
objective = t(c) %*% coefficients + (1/2) * (t(coefficients) %*% H %*% coefficients)
objective

# see equation 1 from http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf
decision.values = predict(model, X_trn_scaled, type = "decision")
hinge.loss = sum(pmax(0, 1 - y_trn * decision.values))
hinge.loss
w = t(t(coefficients * y_trn) %*% X_trn_scaled)
w
(1/2) * (t(w) %*% w) + C * hinge.loss

# libsvm example (through caret and e1071)
library(caret)
library(e1071)
model.linear = train(X_trn, factor(y_trn), method = "svmLinear2",
                     trControl = trainControl(method = "repeatedcv", number = 5, repeats = 5))
plot(model.linear)
model.linear$finalModel$index
round(as.vector(model.linear$finalModel$coefs), 4)

# prediction: reconstructing the decision values by hand from the fitted model
predictions1 = predict(model.linear$finalModel, X_tst, decision.values = TRUE)
summary(attr(predictions1, "decision.values"))
coefficients = c(-model.linear$finalModel$rho,
                 t(model.linear$finalModel$coefs) %*% model.linear$finalModel$SV)
X_tst_scaled = scale(X_tst, center = model.linear$finalModel$x.scale$`scaled:center`,
                     scale = model.linear$finalModel$x.scale$`scaled:scale`)
predictions2 = cbind(rep(1, nrow(X_tst_scaled)), X_tst_scaled) %*% coefficients
summary(predictions2)

# example of manipulating the cost parameter: smaller cost, more support vectors
X_trn = rbind(mvrnorm(500, c(1.5, 1.5), diag(2)), mvrnorm(500, c(-1.5, -1.5), diag(2)))
y_trn = rep(c(1, -1), c(500, 500))
X_tst = rbind(mvrnorm(500, c(1.5, 1.5), diag(2)), mvrnorm(500, c(-1.5, -1.5), diag(2)))
y_tst = rep(c(1, -1), c(500, 500))
model1 = svm(X_trn, y_trn, type = "C-classification", kernel = "linear", cost = 0.01)
model1$nSV
model2 = svm(X_trn, y_trn, type = "C-classification", kernel = "linear", cost = 0.1)
model2$nSV
model3 = svm(X_trn, y_trn, type = "C-classification", kernel = "linear", cost = 1)
model3$nSV

# exercise: try the caret methods svmPoly and svmRadial on the following two data sets
# (a sketch with svmRadial follows below)
X_trn = cbind(runif(10000, -3, 3), runif(10000, -1, 5))
y_trn = as.integer((X_trn[, 2] - X_trn[, 1]^2) > 0)
plot(X_trn, col = y_trn + 1)
table(y_trn)
X_trn = cbind(runif(10000, -4, 4), runif(10000, -3, 3))
y_trn = as.integer(((X_trn[, 1]^2) / 9 + (X_trn[, 2]^2) / 4) > 1)
plot(X_trn, col = y_trn + 1)
table(y_trn)
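# A possible starting point for the exercise above, not part of the original code: fitting the
# radial-basis kernel through caret on the second (ellipse) data set currently held in X_trn / y_trn.
# The sigma and C grid values below are arbitrary choices; svmPoly can be substituted with a grid
# over degree, scale, and C.
model.radial = train(X_trn, factor(y_trn), method = "svmRadial",
                     trControl = trainControl(method = "cv", number = 5),
                     tuneGrid = expand.grid(sigma = c(0.5, 1, 2), C = c(0.25, 1, 4)))
model.radial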
# simple regression example from the "?svm" help page
x_trn = seq(0.10, 5.00, by = 0.05)
n_trn = length(x_trn)
y_trn = log(x_trn) + rnorm(n_trn, sd = 0.20)
model = svm(x_trn, y_trn)
x_tst = seq(0.10, 5.00, by = 0.05)
n_tst = length(x_tst)
y_tst = log(x_tst) + rnorm(n_tst, sd = 0.20)
predictions3 = predict(model, x_tst)
summary(predictions3)

# simple implementation of the svm regression prediction
# (the default radial kernel with gamma = 1, since there is a single feature)
x_tst_scaled = scale(x_tst, center = model$x.scale$`scaled:center`,
                     scale = model$x.scale$`scaled:scale`)
predictions4 = array(0, n_tst)
for (i in 1:n_tst) {
  predictions4[i] = model$y.scale$`scaled:scale` *
    (sum(model$coefs * exp(-((x_tst_scaled[i] - model$SV)^2))) - model$rho) +
    model$y.scale$`scaled:center`
}
summary(predictions4)

# example code for the kaggle exercise
set.seed(2^17 - 1)
setwd("C:/Data/Reuters")
library(e1071)
trn = read.matrix.csr("trn.dat")
tst = read.matrix.csr("tst.dat")
dim(trn$x)
object.size(trn$x)  # sparse storage: 18,069,496 bytes, much less than the dense 7,649,208,896 bytes (20,242 * 47,236 * 8)
y = factor(trn$y, levels = c(0, 1), labels = c("no", "yes"))
model.svm = svm(trn$x, y, kernel = "linear", probability = TRUE)
predictions.svm = predict(model.svm, newdata = tst$x, probability = TRUE)
probabilities.svm = attributes(predictions.svm)$probabilities[, "yes"]

library(xgboost)
# xgb.DMatrix reads the same libsvm/SVMlight-format files
# (newer xgboost versions may require the "trn.dat?format=libsvm" URI form)
trn = xgb.DMatrix("trn.dat")
tst = xgb.DMatrix("tst.dat")
model.xgb = xgboost(trn, params = list(objective = "binary:logistic", eta = 0.1,
                                       max.depth = 16, eval_metric = "auc"), nrounds = 100)
predictions.xgb = predict(model.xgb, tst)
probabilities.xgb = predictions.xgb

predictions = probabilities.svm  # submit the SVM probabilities; probabilities.xgb could be substituted here
output = data.frame(Index = 1:length(predictions), Prediction = predictions)
write.csv(output, "predictions.csv", quote = FALSE, row.names = FALSE)
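# An optional sanity check before deciding which probabilities to submit, not part of the original
# code: cross-validated AUC for the xgboost model with the same parameters as above. xgb.cv is part
# of the xgboost package; the number of folds here is an arbitrary choice.
cv.xgb = xgb.cv(params = list(objective = "binary:logistic", eta = 0.1,
                              max.depth = 16, eval_metric = "auc"),
                data = trn, nrounds = 100, nfold = 5)
cv.xgb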