Source code for MLT.datasets.NSL

# pylint: disable=C0103
"""Load the NSL_KDD dataset from the pickle and filter for attributes"""
import json
import os
import numpy as np
from MLT.tools import dataset_tools
# Directory the NSL_KDD files are read from, resolved relative to this module
NSL_FOLDER_PATH = os.path.join(
    os.path.dirname(__file__),
    '..',
    'datasets',
    'NSL_KDD',
)


def get_NSL_6class():
    """Return the NSL_KDD splits restricted to six features, with binary labels.

    Thin wrapper around ``_load_nsl`` that fixes the column selection.
    """
    six_features = [
        'duration',
        'protocol_type',
        'src_bytes',
        'dst_bytes',
        'count',
        'srv_count',
    ]
    return _load_nsl(six_features)
def get_NSL_16class():
    """Return the NSL_KDD splits restricted to 16 features, with binary labels.

    The feature selection follows Iglesias & Zseby (2015). Thin wrapper around
    ``_load_nsl`` that fixes the column selection.
    """
    sixteen_features = [
        'service',
        'flag',
        'dst_bytes',
        'wrong_fragment',
        'count',
        'serror_rate',
        'srv_serror_rate',
        'srv_rerror_rate',
        'same_srv_rate',
        'dst_host_count',
        'dst_host_srv_count',
        'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate',
        'dst_host_serror_rate',
        'dst_host_srv_serror_rate',
        'dst_host_rerror_rate',
    ]
    return _load_nsl(sixteen_features)
def _load_nsl(column_names):
    """Load the pickled NSL_KDD splits, keep the given columns and binarize labels.

    Parameters
    ----------
    column_names : list(str)
        List of column names that you want in your dataset.

    Returns
    -------
    data : tuple
        ``(train_data, test_data, train_labels, test_labels)``. The data
        entries are the loaded frames filtered to ``column_names``; the label
        entries are integer arrays holding 0 for normal/benign entries and
        1 for everything else.

    Raises
    ------
    KeyError
        If the label word index has no ``'normal'`` entry.
    """
    # ## Data loading and prep
    # The encoded dataset has been pickled beforehand, so loading the pickles
    # is enough to get the frames back (presumably pandas DataFrames - the
    # loader lives in MLT.tools.dataset_tools).
    # **Hint**: If you miss the pickles, go ahead and run *pickle-NSL*
    # or *main.py --pickeNSL*
    kdd_train_data = dataset_tools.load_df('kdd_train_data', NSL_FOLDER_PATH)
    kdd_test_data = dataset_tools.load_df('kdd_test_data', NSL_FOLDER_PATH)
    kdd_train_labels = dataset_tools.load_df('kdd_train_labels', NSL_FOLDER_PATH)
    kdd_test_labels = dataset_tools.load_df('kdd_test_labels', NSL_FOLDER_PATH)

    # Only the requested feature columns are kept.
    kdd_train_data = kdd_train_data.filter(column_names)
    kdd_test_data = kdd_test_data.filter(column_names)

    # ### Label translation
    # As we are doing binary classification,
    # we only need to know if the entry is normal/benign (*0*) or malicious (*1*)
    wordindex_path = os.path.join(NSL_FOLDER_PATH, 'kdd_label_wordindex.json')
    # JSON is UTF-8 by specification; do not depend on the locale's default.
    with open(wordindex_path, encoding='utf-8') as json_in:
        data = json.load(json_in)

    print('Loaded these labels from Tokenization process:')
    print(data)
    normal_index = data['normal']

    def translate_to_binary(label_values):
        # One vectorized comparison instead of np.vectorize, which runs a
        # Python-level loop and fails on empty input when otypes is unset.
        return (np.asarray(label_values) != normal_index).astype(int)

    # We only want to know if it's benign or not, so we switch to 0 or 1
    kdd_train_labels = translate_to_binary(kdd_train_labels['label_encoded'].values)
    kdd_test_labels = translate_to_binary(kdd_test_labels['label_encoded'].values)

    print("")
    print("No of train entries:\t", len(kdd_train_data))
    print("No of train labels:\t", len(kdd_train_labels))
    print("-----------")
    print("No of test entries:\t", len(kdd_test_data))
    print("No of test labels:\t", len(kdd_test_labels))

    return (kdd_train_data, kdd_test_data, kdd_train_labels, kdd_test_labels)