# pylint: disable=C0103
"""Load the NSL_KDD dataset from the pickle and filter for attributes"""
import json
import os
import numpy as np
from MLT.tools import dataset_tools
# Where to load the NSL dataset from
NSL_FOLDER_PATH = (os.path.join(os.path.dirname(__file__), '..', 'datasets', 'NSL_KDD'))
def get_NSL_6class():
    """Return the NSL-KDD dataset restricted to 6 features, with binary labels.

    The feature subset is: duration, protocol_type, src_bytes, dst_bytes,
    count and srv_count. Loading, filtering and label binarization are
    delegated to :func:`_load_nsl`.
    """
    selected_features = [
        'duration', 'protocol_type', 'src_bytes',
        'dst_bytes', 'count', 'srv_count',
    ]
    return _load_nsl(selected_features)
def get_NSL_16class():
    """Return the NSL-KDD dataset restricted to 16 features, with binary labels.

    The feature subset follows Iglesias & Zseby (2015). Loading, filtering
    and label binarization are delegated to :func:`_load_nsl`.
    """
    selected_features = [
        'service', 'flag', 'dst_bytes', 'wrong_fragment',
        'count', 'serror_rate', 'srv_serror_rate', 'srv_rerror_rate',
        'same_srv_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate',
    ]
    return _load_nsl(selected_features)
def _load_nsl(column_names):
    """Load the pickled NSL-KDD dataset and keep only the given columns.

    Parameters
    ----------
    column_names : list(str)
        List of column names that you want in your dataset.

    Returns
    -------
    data : tuple
        ``(train_data, test_data, train_labels, test_labels)`` where the
        data are column-filtered Pandas DataFrames and the labels are
        numpy arrays of 0 (normal/benign) or 1 (malicious).
    """
    # ## Data loading and prep
    # As we've pickled the encoded dataset,
    # we only need to load these pickles to get the Pandas DataFrames back.
    # **Hint**: If you miss the pickles, go ahead and run *pickle-NSL* or *main.py --pickleNSL*
    kdd_train_data = dataset_tools.load_df('kdd_train_data', NSL_FOLDER_PATH)
    kdd_test_data = dataset_tools.load_df('kdd_test_data', NSL_FOLDER_PATH)
    kdd_train_labels = dataset_tools.load_df('kdd_train_labels', NSL_FOLDER_PATH)
    kdd_test_labels = dataset_tools.load_df('kdd_test_labels', NSL_FOLDER_PATH)
    # Keep only the requested feature subset (e.g. the 6 or 16 features
    # chosen by the public helper functions).
    kdd_train_data = kdd_train_data.filter(column_names)
    kdd_test_data = kdd_test_data.filter(column_names)
    # ### Label translation
    # As we are doing binary classification,
    # we only need to know if the entry is normal/benign (*0*) or malicious (*1*)
    # Explicit encoding so the JSON decodes identically on every platform.
    with open(os.path.join(NSL_FOLDER_PATH, 'kdd_label_wordindex.json'),
              encoding='utf-8') as json_in:
        data = json.load(json_in)
    print('Loaded these labels from Tokenization process:')
    print(data)
    normal_index = data['normal']
    # Vectorized comparison instead of np.vectorize over a scalar function:
    # identical 0/1 result, but runs at C speed.
    kdd_train_labels = (kdd_train_labels['label_encoded'].values != normal_index).astype(int)
    kdd_test_labels = (kdd_test_labels['label_encoded'].values != normal_index).astype(int)
    print("")
    print("No of train entries:\t", len(kdd_train_data))
    print("No of train labels:\t", len(kdd_train_labels))
    print("-----------")
    print("No of test entries:\t", len(kdd_test_data))
    print("No of test labels:\t", len(kdd_test_labels))
    return (kdd_train_data, kdd_test_data, kdd_train_labels, kdd_test_labels)