import numpy

# Load the four tab-delimited input files, in this order: training features,
# training labels, test features, test labels.  The first line of every file
# is skipped (presumably a header row).
trn_X, trn_y, tst_X, tst_y = (
    numpy.loadtxt(path, delimiter="\t", skiprows=1)
    for path in (
        "trn_X_preprocessed.tsv",
        "trn_y.txt",
        "tst_X_preprocessed.tsv",
        "tst_y.txt",
    )
)

# Per-sample training weights: rows labelled 0 get weight 1.0 and every other
# row gets 13.0, so the non-zero class counts more heavily when the model is
# fitted with these as sample weights.
weights = [1.0 if label == 0 else 13.0 for label in trn_y]

from sklearn.feature_selection import SelectKBest, chi2
# Keep the 37 features with the highest chi-squared score against the
# training labels.  The selector is fitted on the training data only, and the
# resulting column subset is applied to both the training and test matrices.
# NOTE(review): chi2 expects non-negative feature values; presumably the
# preprocessing step guarantees this -- confirm.
selector = SelectKBest(score_func=chi2, k=37).fit(trn_X, trn_y)
trn_X_subset = selector.transform(trn_X)
tst_X_subset = selector.transform(tst_X)

from sklearn.ensemble import GradientBoostingClassifier
# Fit a gradient-boosted classifier on the selected training features, using
# the per-sample weights, then predict a class for every test row.
# NOTE(review): no random_state is passed, so results may not be exactly
# reproducible from run to run -- consider fixing a seed.
gbc = GradientBoostingClassifier(max_depth=6, min_samples_leaf=10).fit(
    trn_X_subset, trn_y, sample_weight=weights
)
predictions = gbc.predict(tst_X_subset)

from sklearn.metrics import confusion_matrix
# Confusion matrix on the test set: rows are actual classes, columns are
# predicted classes.  labels=[0, 1] pins the result to a 2x2 matrix in that
# order even when one class is missing from both tst_y and predictions;
# without it the matrix would shrink to 1x1 and the [0, 1] / [1, 0] / [1, 1]
# lookups in the cost calculation would raise IndexError.
confusion = confusion_matrix(tst_y, predictions, labels=[0, 1])
print(confusion)

# Name the four cells of the 2x2 matrix (row = actual class, column =
# predicted class).
tn, fp = confusion[0, 0], confusion[0, 1]
fn, tp = confusion[1, 0], confusion[1, 1]
total = tn + fp + fn + tp

# Baseline cost: 2640 for every test claim.
origcost = total * 2640

# Cost when acting on the predictions: 2640 is still charged for every claim
# except true positives, and 203 is charged for every claim predicted as 1.
predcost = 2640 * (tn + fp + fn) + 203 * (fp + tp)

# Average saving per test claim; the 1.0 factor forces float division.
avgSavings = (origcost - predcost) * 1.0 / total
print(avgSavings)

# confusion matrix:
#               predict = 0    predict = 1
# actual = 0          3,739            131
# actual = 1            183             30

# expected savings per claim (averaged over the 4,083 test claims above):
#     ($10,779,120 - $10,732,603) / 4,083 = $11.39

# TODO: try stacking/blending multiple models