# Source code for MLT.testrunners.kfold_runner

"""This runner implements the benchmark using k-fold cross-validation with a configurable number of folds."""
import os
import warnings
import json
import numpy as np
from sklearn.model_selection import KFold

from MLT.implementations import Autoencoder, HBOS, IsolationForest, LSTM_2_Multiclass, RandomForest, XGBoost

from MLT.metrics import metrics
from MLT.tools import dataset_tools, result_mail

# Suppress one specific DeprecationWarning for modules matching 'sklearn*':
# sklearn is currently built against an older numpy version.
warnings.filterwarnings(
    action='ignore',
    message='The truth value of an empty array is ambiguous',
    category=DeprecationWarning,
    module='sklearn*',
)

def _save_model_stats(model_stats, save_name, display_name, result_path):
    """Compute and store the metrics of one model without aborting the run on failure.

    Keyword arguments:
    model_stats  -- List with one training-pass result per fold
    save_name    -- Model name handed to metrics.calc_metrics_and_save_to_disk
    display_name -- Model name used in the console message on failure
    result_path  -- Where to save the results
    """
    import traceback  # local import keeps the module-level import block untouched

    try:
        metrics.calc_metrics_and_save_to_disk(model_stats, save_name, result_path)
    except Exception:
        # Best effort: one failing model must not prevent the others from being
        # saved, but the cause has to be visible to whoever reads the log.
        print('Ran into exception while saving {} results to disk'.format(display_name))
        traceback.print_exc()


def run_benchmark(candidate_data, candidate_labels, result_path, model_savepath, args):
    """Run the k-fold benchmark itself.

    Note the absence of train- and test-partitions. As this is a crossvalidation run,
    the test partition is not to be touched!

    Keyword arguments:
    candidate_data   -- Training data (described as having 6 features); it is
                        indexed with the index arrays produced by KFold.split
    candidate_labels -- According labels for supervised learning, indexed the same way
    result_path      -- Where to save the results
    model_savepath   -- Directory below which the per-model file names are built
    args             -- Parsed CMD arguments that contain all the switches and settings

    Returns:
    The absolute form of result_path, for other runners to use.
    """
    kfold_count = args.kfolds

    # Each switch is falsy when the model is disabled, otherwise it is indexed
    # for the model's hyperparameters below.
    withXGBoost = args.XGBoost
    withRandomForest = args.RandomForest
    withLSTM2 = args.LSTM2
    withHBOS = args.HBOS
    withAutoEnc = args.AutoEncoder
    withIForest = args.IsolationForest

    xgboost_stats = []
    random_forest_stats = []
    lstm2_stats = []
    hbos_stats = []
    autoenc_stats = []
    iforest_stats = []

    fold_indices = {'short': {}, 'full': {}}

    # k-fold Crossval. Split into k-1 training and 1 test part - repeat k times.
    kfold = KFold(n_splits=kfold_count)
    for fold_counter, (train, test) in enumerate(kfold.split(candidate_data), start=1):
        # 'short' keeps numpy's string form of the test indices,
        # 'full' keeps the complete index lists for train and test.
        fold_indices['short'][fold_counter] = {'test_indices': str(test)}
        fold_indices['full'][fold_counter] = {
            'train_indices': train.tolist(),
            'test_indices': test.tolist(),
        }

        fold_train_data = candidate_data[train]
        fold_test_data = candidate_data[test]
        fold_train_labels = candidate_labels[train]
        fold_test_labels = candidate_labels[test]

        # Share of non-zero labels in the test fold; passed on as "contamination".
        outliers_fraction = np.count_nonzero(fold_test_labels) / len(fold_test_labels)
        outliers_percentage = round(outliers_fraction * 100, ndigits=4)
        print("Outlier Percentage:", outliers_percentage)

        if args.anomaly:
            print("Anomaly/Novelty mode! Dropping all attacks from training partition.")
            before = dict(zip(*np.unique(fold_train_labels, return_counts=True)))
            print("Before:\n\tLabel Count: {}\n\tEntry Count: {}".format(before, len(fold_train_data)))

            # Filter the data first: the mask is derived from the unfiltered labels.
            fold_train_data = fold_train_data[fold_train_labels != 1]
            fold_train_labels = fold_train_labels[fold_train_labels != 1]
            print("Entry Count After: {}".format(len(fold_train_data)))

        if args.unsupervised:
            fold_train_labels = None  # Pass empty train labels

        # now fit the models
        print('\nBeginning training pass {:2d}/{}'.format(fold_counter, kfold_count))

        if withXGBoost:
            print("Training XGBoost")
            full_filename = os.path.join(model_savepath, "XGBoost-" + str(fold_counter))
            xgb_train_pass = XGBoost.train_model(
                withXGBoost[0],  # n_estimators
                withXGBoost[1],  # max_depth
                withXGBoost[2],  # learning_rate
                fold_train_data,
                fold_train_labels,
                fold_test_data,
                fold_test_labels,
                full_filename
            )
            xgboost_stats.append(xgb_train_pass)

        if withRandomForest:
            print("Training Random Forest")
            full_filename = os.path.join(model_savepath, "RandomForest-" + str(fold_counter))
            random_forest_pass = RandomForest.train_model(
                withRandomForest[0],  # n_estimators
                withRandomForest[1],  # max_depth
                fold_train_data,
                fold_train_labels,
                fold_test_data,
                fold_test_labels,
                full_filename
            )
            random_forest_stats.append(random_forest_pass)

        if withLSTM2:
            print("Training 2-Class LSTM")
            tensorboard_logdir = os.path.join(result_path, 'LSTM2C', str(fold_counter))
            lstm_savepath = os.path.join(model_savepath, 'LSTM2C-' + str(fold_counter))
            lstm2_pass = LSTM_2_Multiclass.train_model(
                withLSTM2[0],  # batch_size
                withLSTM2[1],  # epochs
                withLSTM2[2],  # learning_rate
                fold_train_data,
                fold_train_labels,
                fold_test_data,
                fold_test_labels,
                tensorboard_logdir,
                lstm_savepath
            )
            lstm2_stats.append(lstm2_pass)

        # NOTE(review): the three save paths below carry no fold suffix, unlike
        # the models above, so every fold hands the same path to train_model.
        # Confirm whether overwriting earlier folds is intended before renaming.
        if withHBOS:
            print("Training HBOS")
            full_filename = os.path.join(model_savepath, "HBOS")
            hbos_pass = HBOS.train_model(
                withHBOS[0],  # n_bins
                withHBOS[1],  # alpha
                withHBOS[2],  # tol
                outliers_fraction,  # contamination
                fold_train_data,
                fold_train_labels,
                fold_test_data,
                fold_test_labels,
                full_filename
            )
            hbos_stats.append(hbos_pass)

        if withAutoEnc:
            print("Training AutoEncoder")
            full_filename = os.path.join(model_savepath, "AutoEncoder")
            autoenc_pass = Autoencoder.train_model(
                fold_train_data,
                fold_train_labels,
                fold_test_data,
                fold_test_labels,
                full_filename,
                batch_size=withAutoEnc[0],  # batch
                epochs=withAutoEnc[1],  # epochs
                dropout_rate=withAutoEnc[2],  # dropout_rate
                contamination=outliers_fraction,  # contamination
                learning_rate=withAutoEnc[3]  # learning rate
            )
            autoenc_stats.append(autoenc_pass)

        if withIForest:
            print("Training Isolation Forest")
            full_filename = os.path.join(model_savepath, "IsolationForest")
            iforest_pass = IsolationForest.train_model(
                fold_train_data,
                fold_train_labels,
                fold_test_data,
                fold_test_labels,
                full_filename,
                n_estimators=withIForest[0],
                contamination=outliers_fraction,
                max_features=withIForest[1],
                bootstrap=withIForest[2]
            )
            iforest_stats.append(iforest_pass)

    # write fold-indices to disk
    filepath = os.path.join(result_path, 'dataset_fold_indices.json')
    with open(filepath, 'w') as mj:
        json.dump(fold_indices, mj)

    # (switch, collected stats, name used for saving, name used in the error message)
    model_results = (
        (withXGBoost, xgboost_stats, 'XGBoost', 'XGBoost'),
        (withRandomForest, random_forest_stats, 'RandomForest', 'Random Forest'),
        (withLSTM2, lstm2_stats, 'LSTM2C', 'LSTM2C'),
        (withHBOS, hbos_stats, 'HBOS', 'HBOS'),
        (withAutoEnc, autoenc_stats, 'AutoEncoder', 'AutoEncoder'),
        (withIForest, iforest_stats, 'IsolationForest', 'IsolationForest'),
    )
    for enabled, model_stats, save_name, display_name in model_results:
        if enabled:
            _save_model_stats(model_stats, save_name, display_name, result_path)

    if args.ResultMail:
        result_mail.prepare_and_send_results(result_path, args)

    full_respath = os.path.abspath(result_path)
    print('Results have ben saved to {}'.format(full_respath))

    # Return the full path for other runners to use
    return full_respath