"""Utility functions for generating ROC and AUC statistics"""
import os

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_auc_score, roc_curve

from tools import toolbelt

def calc_auc(prediction_data):
    """Calculates the area under the ROC curve for every entry in the given prediction data."""
    all_auc = []
    for pred in prediction_data:
        all_auc.append(roc_auc_score(pred.test_labels, pred.predicted_probabilities))
    return all_auc
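
# Usage sketch (hypothetical data): `prediction_data` is any iterable whose entries
# expose `test_labels` and `predicted_probabilities`, e.g. one entry per CV fold:
#
#     from collections import namedtuple
#     Prediction = namedtuple('Prediction', ['test_labels', 'predicted_probabilities'])
#     folds = [Prediction([0, 1, 1, 0], [0.2, 0.8, 0.6, 0.4])]
#     print(calc_auc(folds))  # -> [1.0]
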
# This is a slightly adapted version of
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
def generate_avg_roc_to_disk(prediction_data, modelname, filepath):
    """Generates the average of all given ROCs and plots every fold's ROC plus the average to a single figure."""
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    for i, pred in enumerate(prediction_data, start=1):
        fpr, tpr, thresholds = roc_curve(pred.test_labels, pred.predicted_probabilities)
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='Fold %d (AUC: %0.2f)' % (i, roc_auc))
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC: %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for {}'.format(modelname))
    plt.legend(loc="lower right")
    savepath = os.path.join(filepath, modelname + '.png')
    plt.savefig(savepath)
    plt.clf()
    plt.close('all')
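
# Usage sketch (hypothetical paths): given per-fold predictions as above, this
# writes one PNG containing every fold's ROC plus their mean:
#
#     generate_avg_roc_to_disk(folds, 'random_forest', '/tmp/rocs')
#     # -> /tmp/rocs/random_forest.png
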
def append_roc_model_selection(result_json, modelname, line_format):
    """Appends the CV-mean ROC of the given results to the current pyplot figure."""
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    for pred in result_json:
        fpr, tpr, thresholds = roc_curve(pred["test_labels"], pred["predicted_probabilities"])
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, line_format,
             markevery=5,  # only plot every 5th marker to avoid overcrowding the graph
             label='%s\n(AUC: %0.2f $\\pm$ %0.2f)' % (modelname, mean_auc, std_auc),
             lw=2, alpha=.8)
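
# Usage sketch (hypothetical data): this only draws onto the current figure;
# axis limits, legend and saving are left to the caller, as in
# generate_cv_roc_model_selection below:
#
#     results = [{'test_labels': [0, 1, 1, 0],
#                 'predicted_probabilities': [0.2, 0.8, 0.6, 0.4]}]
#     append_roc_model_selection(results, 'baseline', 'ks--')
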
def generate_cv_roc_model_selection(modelname, result_path, parameter_name, model_id_list=None, format_list=None):
    """Generates an average ROC for all CV-results in a given folder.

    Point this function to a folder that contains multiple result subfolders with cross-validated results.
    It will generate the average ROC for every result and add them all to a single figure.

    Args:
        modelname (string): The name of the model to draw. Used to determine the filename and title of the plot.
        result_path (string or list): Path to the result base folder that contains multiple test runs.
            Can also be a list of single runs; all runs will be combined into a single figure.
        parameter_name (string): Parameter under test - will appear in the title and be appended to the filename.
        model_id_list (list): A list of strings, used for the legend.
        format_list (list): A list of pyplot format strings to be used for the single plots.
    """
    if isinstance(result_path, str):
        result_path = [result_path]  # the docstring allows a bare string; normalize it to a list
    if len(result_path) > 1:
        subfolders = result_path
        result_path = os.path.join(subfolders[0], '..')
    else:
        result_path = result_path[0]
        subfolders = toolbelt.list_folders(result_path)
    no_of_subs = len(subfolders)
    # Define a list of line style format strings for the different plots.
    # See https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html#matplotlib.pyplot.plot
    if format_list is None or len(format_list) < no_of_subs:
        print('Format not provided or not enough entries. Falling back to defaults.\nRemember to clean old results!')
        format_list = ['ks--', 'kv--', 'k<--', 'kp--', 'kx--', 'kd--', 'rs--', 'rv--', 'r<--', 'rp--', 'rx--', 'rd--']
    if model_id_list is None or len(model_id_list) < no_of_subs:
        print('Model names not provided or not enough entries. Falling back to defaults.\nRemember to clean old results!')
        model_id_list = [modelname] * no_of_subs  # just repeat the provided modelname for every run
    # Clear all remaining plots
    plt.clf()
    plt.close('all')
    # Add the luck line
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8)
    # Loop over the subfolders and generate the ROCs
    for idx, subfolder in enumerate(subfolders):
        cv_runpath = os.path.join(result_path, subfolder)
        cv_results = toolbelt.load_results_from_disk(cv_runpath, modelname)
        print("Loaded runpath: {}".format(cv_runpath))
        append_roc_model_selection(cv_results, model_id_list[idx], format_list[idx])
    # Set main legend and info.
    # Legend placement: https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot/43439132#43439132
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('{} ({})'.format(modelname, parameter_name))
    plt.legend(loc="lower right", ncol=2, prop={'size': 8})
    # Save to disk
    folder_name = os.path.basename(os.path.realpath(result_path))
    savepath = os.path.join(result_path, folder_name + '-' + modelname + '_' + parameter_name + '.pdf')
    plt.savefig(savepath, bbox_inches="tight", format="pdf")
    # Finally, clean up again
    plt.clf()
    plt.close('all')
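
# Usage sketch (hypothetical folder layout): 'results/' holds one subfolder per
# tested parameter value, each containing cross-validated runs for the model:
#
#     generate_cv_roc_model_selection('svm', ['results'], 'kernel',
#                                     model_id_list=['linear', 'rbf'],
#                                     format_list=['ks--', 'rv--'])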