"""Train a class-weighted gradient-boosting classifier and report savings.

Pipeline, in order:

1. Load the preprocessed train/test feature matrices and label vectors
   from tab-separated files (the first row of each file is skipped).
2. Keep the ``N_FEATURES`` features with the highest chi-squared score,
   fitted on the training set only.
3. Fit a ``GradientBoostingClassifier`` in which every training sample
   whose label is not 0 is weighted ``POSITIVE_WEIGHT`` times a label-0
   sample.
4. Print the test-set confusion matrix and the average savings per case
   under the cost model implemented in :func:`average_savings`.
"""
import numpy

# Cost charged for every case when no model is used; with the model it is
# still charged for every case that is predicted 0, and for false positives.
# NOTE(review): the business meaning of this figure is not stated in the
# file -- confirm with the data owner.
BASELINE_COST = 2640

# Cost charged for every case predicted 1. A true positive pays only this
# amount; a false positive pays it on top of BASELINE_COST.
FLAG_COST = 203

# Sample weight for every training row whose label is not 0 (label-0 rows
# get weight 1.0).
POSITIVE_WEIGHT = 13.0

# Number of features kept by the chi-squared selector.
N_FEATURES = 37


def load_tsv(path):
    """Load a tab-separated numeric file, skipping its first row."""
    return numpy.loadtxt(path, delimiter="\t", skiprows=1)


def class_weights(labels, positive_weight=POSITIVE_WEIGHT):
    """Return one sample weight per label.

    Labels equal to 0 get weight 1.0; every other label gets
    ``positive_weight``.
    """
    return numpy.where(numpy.asarray(labels) == 0, 1.0, positive_weight)


def average_savings(confusion, baseline_cost=BASELINE_COST,
                    flag_cost=FLAG_COST):
    """Return the average cost saved per case by acting on predictions.

    ``confusion`` is a 2x2 matrix indexed ``[actual, predicted]``.

    Without the model every case costs ``baseline_cost``. With the model:

    * predicted 0 (either actual class): ``baseline_cost``
    * actual 0, predicted 1:             ``baseline_cost + flag_cost``
    * actual 1, predicted 1:             ``flag_cost``
    """
    confusion = numpy.asarray(confusion)
    total = (confusion[0, 0] + confusion[0, 1]
             + confusion[1, 0] + confusion[1, 1])
    origcost = baseline_cost * total
    predcost = (baseline_cost * confusion[0, 0]
                + (baseline_cost + flag_cost) * confusion[0, 1]
                + baseline_cost * confusion[1, 0]
                + flag_cost * confusion[1, 1])
    return 1.0 * (origcost - predcost) / total


def main():
    """Run the full train / predict / evaluate pipeline and print results."""
    # Imported here so the helpers above can be used with numpy alone.
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.metrics import confusion_matrix

    trn_X = load_tsv("trn_X_preprocessed.tsv")
    trn_y = load_tsv("trn_y.txt")
    tst_X = load_tsv("tst_X_preprocessed.tsv")
    tst_y = load_tsv("tst_y.txt")

    weights = class_weights(trn_y)

    # The selector is fitted on training data only and then applied to both
    # sets, so no test information influences which features are kept.
    selector = SelectKBest(score_func=chi2, k=N_FEATURES)
    selector.fit(trn_X, trn_y)
    trn_X_subset = selector.transform(trn_X)
    tst_X_subset = selector.transform(tst_X)

    gbc = GradientBoostingClassifier(min_samples_leaf=10, max_depth=6)
    gbc.fit(trn_X_subset, trn_y, sample_weight=weights)
    predictions = gbc.predict(tst_X_subset)

    # Pin the label order so the matrix is always 2x2, even when one class
    # is absent from both the test labels and the predictions; otherwise
    # the [0, 1] / [1, 0] / [1, 1] lookups would raise IndexError.
    confusion = confusion_matrix(tst_y, predictions, labels=[0, 1])
    print(confusion)
    print(average_savings(confusion))


if __name__ == "__main__":
    main()

# Recorded result of a previous run.
#
# confusion matrix:
#               predict = 0    predict = 1
# actual = 0          3,739            131
# actual = 1            183             30
#
# expected savings per claim:
# ($10,779,120 - $10,732,603) / 4083 = $11.39

# TODO: try stacking/blending