"""Load the CICIDS2017 dataset from the pickle and filter features"""
import os
import numpy as np
from MLT.tools import dataset_tools
# Folder holding the pickled CICIDS2017 dataset, resolved relative to this
# module: <module dir>/../datasets/CICIDS2017pub
CIC_FOLDER_PATH = os.path.join(
    os.path.dirname(__file__),
    os.pardir,
    'datasets',
    'CICIDS2017pub',
)
def get_CIC_Top20():
    """Get the randomized dataset reduced to the Top 20 feature columns.

    The column indices were identified with mutual_info_classif; to
    regenerate them, run cic_feature_selection.py.

    Returns:
        tuple: Whatever ``_load_cic`` returns for this column selection,
        i.e. ``(train_data, test_data, train_labels, test_labels)``.
    """
    # Positional indices of the columns to keep, as found by the feature
    # selection run with Mutual Info Classif.
    top20_columns = [
        0, 3, 4, 5, 9, 11, 12, 17, 19, 22,
        36, 37, 38, 39, 49, 51, 54, 56, 57, 58,
    ]
    return _load_cic(columns=top20_columns)
def get_CIC(transformed=False):
    """Get the dataset with all columns kept.

    Args:
        transformed (bool, optional): Forwarded to ``_load_cic``; selects the
            PowerTransformed version of the dataset when true.

    Returns:
        tuple: The result of ``_load_cic`` without any column filtering.
    """
    # ``None`` as the column selection means "keep every column".
    return _load_cic(None, transformed)
def _load_cic(columns=None, transformed=False):
    """Load and return the feature dataset as a tuple.

    Args:
        columns (list[int] or list[str], optional): Columns to keep from the
            full dataset, given either as positional indices (Python or NumPy
            integers) or as column names. ``None`` keeps every column.
        transformed (bool, optional): Whether to use a PowerTransformed
            version of the dataset (the pickles with the ``_yj`` suffix).

    Returns:
        tuple: ``(train_data, test_data, train_labels, test_labels)``. The
        label entries are NumPy arrays holding 0 for benign and 1 for
        malicious entries.

    Raises:
        ValueError: If ``columns`` is given but empty.
        TypeError: If ``columns`` holds neither integers nor strings.
    """
    ## Data loading and prep
    # The transformed pickles use the same base names plus a "_yj" suffix.
    suffix = '_yj' if transformed else ''
    traind = 'cic_train_data_rand' + suffix
    trainl = 'cic_train_labels_rand' + suffix
    testd = 'cic_test_data_rand' + suffix
    testl = 'cic_test_labels_rand' + suffix

    # As we've pickled the encoded dataset,
    # we only need to load these pickles to get the Pandas DataFrames back.
    cic_train_data = dataset_tools.load_df(traind, CIC_FOLDER_PATH)
    cic_train_labels = dataset_tools.load_df(trainl, CIC_FOLDER_PATH)
    cic_test_data = dataset_tools.load_df(testd, CIC_FOLDER_PATH)
    cic_test_labels = dataset_tools.load_df(testl, CIC_FOLDER_PATH)

    if columns is not None:
        columns = list(columns)
        if not columns:
            raise ValueError("columns must contain at least one entry")

        # The type of the first entry decides between positional and
        # name-based selection. NumPy integers (e.g. indices coming out of a
        # feature-selection step) are accepted as well; checking for plain
        # ``int`` only would silently skip the filtering for them.
        first = columns[0]
        if isinstance(first, (int, np.integer)):
            cic_train_data = cic_train_data.iloc[:, columns]
            cic_test_data = cic_test_data.iloc[:, columns]
        elif isinstance(first, str):
            cic_train_data = cic_train_data.filter(columns)
            cic_test_data = cic_test_data.filter(columns)
        else:
            # Fail loudly instead of returning the unfiltered dataset.
            raise TypeError(
                "columns must contain integers or strings, got "
                + type(first).__name__
            )

    # ### Label translation
    # As we are doing binary classification,
    # we only need to know if the entry is normal/benign (*0*) or malicious (*1*)
    # Also, by default the encoding starts with BENIGN -> 1
    # np.where does the mapping in one vectorized pass and, unlike
    # np.vectorize without otypes, also works on empty label arrays.
    cic_train_labels = np.where(cic_train_labels['label_encoded'].values == 1, 0, 1)
    cic_test_labels = np.where(cic_test_labels['label_encoded'].values == 1, 0, 1)

    print("")
    print("No of train entries:\t", len(cic_train_data))
    print("No of train labels:\t", len(cic_train_labels))
    print("-----------")
    print("No of test entries:\t", len(cic_test_data))
    print("No of test labels:\t", len(cic_test_labels))

    return (cic_train_data, cic_test_data, cic_train_labels, cic_test_labels)