Source code for MLT.metrics.metrics_roc

"""Utility functions for generating ROC and AUC statistics"""
import os
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import auc
import numpy as np
import matplotlib.pyplot as plt

from tools import toolbelt


def calc_auc(prediction_data):
    """Calculates the area under the ROC curve for each given prediction set."""
    all_auc = []
    for pred in prediction_data:
        all_auc.append(roc_auc_score(pred.test_labels, pred.predicted_probabilities))
    return all_auc
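
# Usage sketch (illustrative only, not part of the original module). calc_auc
# expects an iterable of per-fold prediction objects exposing `test_labels` and
# `predicted_probabilities`; the namedtuple below is a hypothetical stand-in for
# whatever container the pipeline actually produces.
def _example_calc_auc():
    from collections import namedtuple
    PredictionResult = namedtuple('PredictionResult',
                                  ['test_labels', 'predicted_probabilities'])
    folds = [
        PredictionResult(test_labels=[0, 1, 1, 0],
                         predicted_probabilities=[0.2, 0.8, 0.6, 0.3]),
        PredictionResult(test_labels=[1, 0, 1, 0],
                         predicted_probabilities=[0.7, 0.4, 0.9, 0.1]),
    ]
    # Returns one ROC-AUC value per fold ([1.0, 1.0] for the toy data above).
    print(calc_auc(folds))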

# This is a slightly adapted version of
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
def generate_avg_roc_to_disk(prediction_data, modelname, filepath):
    """Generates the average of all given ROCs and plots every ROC plus the average into a single figure."""
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    for i, pred in enumerate(prediction_data, start=1):
        fpr, tpr, thresholds = roc_curve(pred.test_labels, pred.predicted_probabilities)
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='Fold %d (AUC: %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC: %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for {}'.format(modelname))
    plt.legend(loc="lower right")

    savepath = os.path.join(filepath, modelname + '.png')
    plt.savefig(savepath)
    plt.clf()
    plt.close('all')
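
# Usage sketch (illustrative only). generate_avg_roc_to_disk takes the same
# per-fold prediction objects as calc_auc plus a model name and an existing
# output directory; the synthetic data and the output path below are hypothetical.
def _example_generate_avg_roc_to_disk():
    from collections import namedtuple
    PredictionResult = namedtuple('PredictionResult',
                                  ['test_labels', 'predicted_probabilities'])
    rng = np.random.RandomState(0)
    folds = []
    for _ in range(5):  # five synthetic CV folds
        labels = rng.randint(0, 2, size=50)
        # noisy scores correlated with the labels
        scores = np.clip(labels * 0.5 + rng.rand(50) * 0.7, 0, 1)
        folds.append(PredictionResult(labels, scores))
    # Writes '<filepath>/my_model.png' containing the per-fold and mean ROC curves.
    generate_avg_roc_to_disk(folds, 'my_model', '/tmp/roc_results')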

def append_roc_model_selection(result_json, modelname, line_format):
    """Appends the CV-mean ROC of a single model to an existing plot."""
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    for pred in result_json:
        fpr, tpr, thresholds = roc_curve(pred["test_labels"], pred["predicted_probabilities"])
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, line_format,
             markevery=5,  # only plot every 5th marker to not overcrowd the graph
             label='%s\n (AUC: %0.2f $\\pm$ %0.2f)' % (modelname, mean_auc, std_auc),
             lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
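
# Usage sketch (illustrative only). append_roc_model_selection expects a list of
# per-fold dicts with "test_labels" and "predicted_probabilities" keys (the result
# format consumed by generate_cv_roc_model_selection below) plus a pyplot format
# string; the data and filenames here are hypothetical.
def _example_append_roc_model_selection():
    result_json = [
        {"test_labels": [0, 1, 1, 0, 1],
         "predicted_probabilities": [0.1, 0.9, 0.7, 0.4, 0.6]},
        {"test_labels": [1, 0, 0, 1, 0],
         "predicted_probabilities": [0.8, 0.3, 0.5, 0.6, 0.2]},
    ]
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8)
    append_roc_model_selection(result_json, 'baseline_model', 'ks--')
    plt.legend(loc="lower right")
    plt.savefig('baseline_roc.png')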

def generate_cv_roc_model_selection(modelname, result_path, parameter_name,
                                    model_id_list=None, format_list=None):
    """Generates an average of all CV-results in a given folder.

    Point this function to a folder that contains multiple result-subfolders with
    crossvalidated results. It will generate the average ROC for every result and
    add them all to a single figure.

    Args:
        modelname (string): The name of the model to draw. Used to determine the
            filename and the title of the plot.
        result_path (list): Path(s) to the results. A single-entry list is treated
            as the base folder that contains multiple test runs; a multi-entry list
            is treated as the individual runs. All runs are combined into a single
            figure.
        parameter_name (string): Parameter under test - included in the title and
            appended to the filename.
        model_id_list (list): A list of strings used for the legend.
        format_list (list): A list of pyplot format strings to be used for the
            single plots.
    """
    if len(result_path) > 1:
        subfolders = result_path
        result_path = os.path.join(subfolders[0], '..')
    else:
        result_path = result_path[0]
        subfolders = toolbelt.list_folders(result_path)
    no_of_subs = len(subfolders)

    # Define a list of line style format strings for the different plots.
    # See https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html#matplotlib.pyplot.plot
    if format_list is None or len(format_list) < no_of_subs:
        print('Format not provided or not enough entries. Falling back to default\n'
              'Remember to clean old results!')
        format_list = ['ks--', 'kv--', 'k<--', 'kp--', 'kx--', 'kd--',
                       'rs--', 'rv--', 'r<--', 'rp--', 'rx--', 'rd--']
    if model_id_list is None or len(model_id_list) < no_of_subs:
        print('Model names not provided or not enough entries. Falling back to default\n'
              'Remember to clean old results!')
        model_id_list = []
        for _ in range(20):
            model_id_list.append(modelname)  # Just use the provided modelname

    # Clear all remaining plots
    plt.clf()
    plt.close('all')

    # Add luck
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)

    # Loop over subfolders and generate the ROCs
    for idx, subfolder in enumerate(subfolders):
        cv_runpath = os.path.join(result_path, subfolder)
        cv_results = toolbelt.load_results_from_disk(cv_runpath, modelname)
        print("Loaded runpath: {}".format(cv_runpath))
        append_roc_model_selection(cv_results, model_id_list[idx], format_list[idx])

    # Set main legend and info
    # Legend placement: https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot/43439132#43439132
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('{} ({})'.format(modelname, parameter_name))
    plt.legend(loc="lower right", ncol=2, prop={'size': 8})

    # Save to disk
    folder_name = os.path.basename(os.path.realpath(result_path))
    savepath = os.path.join(result_path,
                            folder_name + '-' + modelname + '_' + parameter_name + '.pdf')
    plt.savefig(savepath, bbox_inches="tight", format="pdf")

    # Finally, clean again
    plt.clf()
    plt.close('all')
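
# Usage sketch (illustrative only). The directory layout, run names, and parameter
# values below are hypothetical; the function relies on toolbelt.list_folders and
# toolbelt.load_results_from_disk to discover and load the per-run CV results.
def _example_generate_cv_roc_model_selection():
    # Single-entry list: treat 'results/learning_rate_sweep' as the base folder
    # whose subfolders each hold one cross-validated run of 'my_model'.
    generate_cv_roc_model_selection(
        modelname='my_model',
        result_path=['results/learning_rate_sweep'],
        parameter_name='learning_rate',
        model_id_list=['lr=0.1', 'lr=0.01', 'lr=0.001'],
        format_list=['ks--', 'kv--', 'k<--'])
    # The combined figure is written into the base folder as
    # '<base>-my_model_learning_rate.pdf'.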