# creating some data
set.seed(2^17 - 1)
library(MASS)
X_trn = rbind(mvrnorm(1000, c(3, 3), diag(2)), mvrnorm(1000, c(-3, -3), diag(2)))
y_trn = rep(c(1, -1), c(1000, 1000))
plot(X_trn, col = (y_trn + 1) / 2 + 1)
X_tst = rbind(mvrnorm(1000, c(3, 3), diag(2)), mvrnorm(1000, c(-3, -3), diag(2)))
y_tst = rep(c(1, -1), c(1000, 1000))

# scaling (preprocessing) the data: important for classifiers that measure similarity between observations
X_trn_scaled = scale(X_trn, center = TRUE, scale = TRUE)
X_tst_scaled = scale(X_tst, center = attr(X_trn_scaled, "scaled:center"),
                     scale = attr(X_trn_scaled, "scaled:scale"))

# interior point optimization (ipop) for solving the SVM dual as a quadratic programming problem
library(kernlab)
C = 0.25
n = nrow(X_trn_scaled)
c = matrix(rep(-1, n), ncol = 1)
H = diag(y_trn) %*% X_trn_scaled %*% t(X_trn_scaled) %*% diag(y_trn)
A = matrix(y_trn, nrow = 1)
b = 0
l = matrix(rep(0, n), ncol = 1)
u = matrix(rep(C, n), ncol = 1)
r = 0
solution = ipop(c, H, A, b, l, u, r)
coefficients = primal(solution)
which(coefficients > 0.000001)  # indices of the support vectors
round(coefficients[coefficients > 0.000001], 4) * y_trn[coefficients > 0.000001]  # signed alphas

# use ksvm to print the value of the objective function
model = ksvm(X_trn_scaled, factor(y_trn), kernel = "vanilladot", C = C, scaled = FALSE)
model

# deriving the value of the objective function from the ipop solution
coefficients = primal(solution)
objective = t(c) %*% coefficients + (1/2) * (t(coefficients) %*% H %*% coefficients)
objective

# see equation 1 from http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf
decision.values = predict(model, X_trn_scaled, type = "decision")
hinge.loss = sum(pmax(0, 1 - y_trn * decision.values))
hinge.loss
w = t(t(coefficients * y_trn) %*% X_trn_scaled)
w
(1/2) * (t(w) %*% w) + C * hinge.loss

# libsvm example (through caret and e1071)
library(caret)
library(e1071)
model.linear = train(X_trn, factor(y_trn), method = "svmLinear2",
                     trControl = trainControl(method = "repeatedcv", number = 5, repeats = 5))
plot(model.linear)
model.linear$finalModel$index
round(as.vector(model.linear$finalModel$coefs), 4)

# prediction: reconstructing the decision values by hand from the fitted model
predictions1 = predict(model.linear$finalModel, X_tst, decision.values = TRUE)
summary(attr(predictions1, "decision.values"))
coefficients = c(-model.linear$finalModel$rho,
                 t(model.linear$finalModel$coefs) %*% model.linear$finalModel$SV)
X_tst_scaled = scale(X_tst, center = model.linear$finalModel$x.scale$`scaled:center`,
                     scale = model.linear$finalModel$x.scale$`scaled:scale`)
predictions2 = cbind(rep(1, nrow(X_tst_scaled)), X_tst_scaled) %*% coefficients
summary(predictions2)

# example of manipulating the cost parameter: smaller cost, more support vectors
X_trn = rbind(mvrnorm(500, c(1.5, 1.5), diag(2)), mvrnorm(500, c(-1.5, -1.5), diag(2)))
y_trn = rep(c(1, -1), c(500, 500))
X_tst = rbind(mvrnorm(500, c(1.5, 1.5), diag(2)), mvrnorm(500, c(-1.5, -1.5), diag(2)))
y_tst = rep(c(1, -1), c(500, 500))
model1 = svm(X_trn, y_trn, type = "C-classification", kernel = "linear", cost = 0.01)
model1$nSV
model2 = svm(X_trn, y_trn, type = "C-classification", kernel = "linear", cost = 0.1)
model2$nSV
model3 = svm(X_trn, y_trn, type = "C-classification", kernel = "linear", cost = 1)
model3$nSV

# exercise: try the caret methods svmPoly and svmRadial on the following two data sets
# (a sketch with svmRadial follows below)
X_trn = cbind(runif(10000, -3, 3), runif(10000, -1, 5))
y_trn = as.integer((X_trn[, 2] - X_trn[, 1]^2) > 0)
plot(X_trn, col = y_trn + 1)
table(y_trn)
X_trn = cbind(runif(10000, -4, 4), runif(10000, -3, 3))
y_trn = as.integer(((X_trn[, 1]^2) / 9 + (X_trn[, 2]^2) / 4) > 1)
plot(X_trn, col = y_trn + 1)
table(y_trn)
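# A possible starting point for the exercise above, not part of the original code: fitting the
# radial-basis kernel through caret on the second (ellipse) data set currently held in X_trn / y_trn.
# The sigma and C grid values below are arbitrary choices; svmPoly can be substituted with a grid
# over degree, scale, and C.
model.radial = train(X_trn, factor(y_trn), method = "svmRadial",
                     trControl = trainControl(method = "cv", number = 5),
                     tuneGrid = expand.grid(sigma = c(0.5, 1, 2), C = c(0.25, 1, 4)))
model.radial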
# simple regression example from the "?svm" help page
x_trn = seq(0.10, 5.00, by = 0.05)
n_trn = length(x_trn)
y_trn = log(x_trn) + rnorm(n_trn, sd = 0.20)
model = svm(x_trn, y_trn)
x_tst = seq(0.10, 5.00, by = 0.05)
n_tst = length(x_tst)
y_tst = log(x_tst) + rnorm(n_tst, sd = 0.20)
predictions3 = predict(model, x_tst)
summary(predictions3)

# simple implementation of the svm regression prediction
# (the default radial kernel with gamma = 1, since there is a single feature)
x_tst_scaled = scale(x_tst, center = model$x.scale$`scaled:center`,
                     scale = model$x.scale$`scaled:scale`)
predictions4 = array(0, n_tst)
for (i in 1:n_tst) {
  predictions4[i] = model$y.scale$`scaled:scale` *
    (sum(model$coefs * exp(-((x_tst_scaled[i] - model$SV)^2))) - model$rho) +
    model$y.scale$`scaled:center`
}
summary(predictions4)

# example code for the kaggle exercise
set.seed(2^17 - 1)
setwd("C:/Data/Reuters")
library(e1071)
trn = read.matrix.csr("trn.dat")
tst = read.matrix.csr("tst.dat")
dim(trn$x)
object.size(trn$x)  # sparse storage: 18,069,496 bytes, much less than the dense 7,649,208,896 bytes (20,242 * 47,236 * 8)
y = factor(trn$y, levels = c(0, 1), labels = c("no", "yes"))
model.svm = svm(trn$x, y, kernel = "linear", probability = TRUE)
predictions.svm = predict(model.svm, newdata = tst$x, probability = TRUE)
probabilities.svm = attributes(predictions.svm)$probabilities[, "yes"]

library(xgboost)
# xgb.DMatrix reads the same libsvm/SVMlight-format files
# (newer xgboost versions may require the "trn.dat?format=libsvm" URI form)
trn = xgb.DMatrix("trn.dat")
tst = xgb.DMatrix("tst.dat")
model.xgb = xgboost(trn, params = list(objective = "binary:logistic", eta = 0.1,
                                       max.depth = 16, eval_metric = "auc"), nrounds = 100)
predictions.xgb = predict(model.xgb, tst)
probabilities.xgb = predictions.xgb

predictions = probabilities.svm  # submit the SVM probabilities; probabilities.xgb could be substituted here
output = data.frame(Index = 1:length(predictions), Prediction = predictions)
write.csv(output, "predictions.csv", quote = FALSE, row.names = FALSE)
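# An optional sanity check before deciding which probabilities to submit, not part of the original
# code: cross-validated AUC for the xgboost model with the same parameters as above. xgb.cv is part
# of the xgboost package; the number of folds here is an arbitrary choice.
cv.xgb = xgb.cv(params = list(objective = "binary:logistic", eta = 0.1,
                              max.depth = 16, eval_metric = "auc"),
                data = trn, nrounds = 100, nfold = 5)
cv.xgb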