# Is the accuracy gap between two classifiers statistically significant?
#
# Each model's predictions on the same 6000 test examples are encoded as a
# 0/1 vector (1 = correct).  Only the per-model totals are encoded here, not
# which examples each model got right, so the pairing used by the paired
# t-test (ttest_rel) is arbitrary.  The script therefore brackets the paired
# statistic between its two extreme pairings, and also reports the unpaired
# test (ttest_ind) and one reproducible random pairing.
import numpy as np
from scipy.stats import ttest_ind, ttest_rel

N_EXAMPLES = 6000

# 5876 / 6000 = 0.9793 (accuracy for our best shallow model)
results_model_0 = [0] * (N_EXAMPLES - 5876) + [1] * 5876
# 5910 / 6000 = 0.9850 (accuracy for our best deep model)
results_model_1 = [0] * (N_EXAMPLES - 5910) + [1] * 5910

# Same sort order: both vectors are "zeros first", so the errors overlap as
# much as possible.  This gives the fewest non-zero paired differences, hence
# the minimum standard error and the largest test statistic.
# Recorded: statistic=5.847056157218386, pvalue=5.267034245912735e-09
rel_same_order = ttest_rel(results_model_1, results_model_0)
print(rel_same_order)

# Opposite sort order: the errors overlap as little as possible, giving the
# maximum standard error and the smallest test statistic.
# Recorded: statistic=2.3250453293684994, pvalue=0.020102730428109634
rel_opposite_order = ttest_rel(results_model_1, results_model_0[::-1])
print(rel_opposite_order)

# Unpaired (independent two-sample) test; it ignores the pairing entirely.
# Recorded: statistic=2.3455397024038467, pvalue=0.0190157522891415
ind_result = ttest_ind(results_model_1, results_model_0)
print(ind_result)

# Random pairing.  A seeded generator keeps the printed result reproducible;
# the lists are shuffled in place.  No extra reversal is applied afterwards:
# reversing an already shuffled list is just another random pairing.
# For reference, an earlier unseeded run gave
# statistic=2.3359967089478295, pvalue=0.019524182254221718.
rng = np.random.default_rng(0)
rng.shuffle(results_model_0)
rng.shuffle(results_model_1)
rel_random = ttest_rel(results_model_1, results_model_0)
print(rel_random)