# Source code for MLT.datasets.CIC

"""Load the CICIDS2017 dataset from the pickle and filter features"""
import os
import numpy as np
from MLT.tools import dataset_tools

# Folder containing the dataset pickles, resolved relative to this module's
# own location rather than the current working directory.
CIC_FOLDER_PATH = os.path.join(
    os.path.dirname(__file__),
    '..',
    'datasets',
    'CICIDS2017pub',
)

def get_CIC_Top20():
    """Get the randomized dataset reduced to the Top 20 subset.

    The subset was identified by mutual_info_classif. To regenerate these
    fields, call cic_feature_selection.py.

    Returns:
        tuple: The result of ``_load_cic`` for the 20 selected columns.
    """
    # Positional column indices; these were generated / found by running
    # cic_feature_select with Mutual Info Classif.
    fields = [
        0, 3, 4, 5, 9, 11, 12, 17, 19, 22,
        36, 37, 38, 39, 49, 51, 54, 56, 57, 58,
    ]
    return _load_cic(fields)

def get_CIC(transformed=False):
    """Get the CIC dataset without restricting it to a column subset.

    Args:
        transformed (bool, optional): Forwarded to ``_load_cic``; whether to
            use the PowerTransformed version of the dataset.

    Returns:
        The value returned by ``_load_cic``, passed through unchanged.
    """
    full_dataset = _load_cic(transformed=transformed)
    return full_dataset


def _load_cic(columns=None, transformed=False):
    """Load and return the feature dataset as a tuple.

    The four frames (train/test data and labels) are read through
    ``dataset_tools.load_df`` from ``CIC_FOLDER_PATH``, the data frames are
    optionally reduced to a column subset, and the encoded labels are
    collapsed into binary labels.

    Args:
        columns (list[int] or list[str], optional): Columns to keep from the
            full dataset. Integers (including NumPy integers) are used as
            positional indices, strings as column names; which of the two
            applies is decided from the first element. ``None`` keeps all
            columns.
        transformed (bool, optional): Whether to use a PowerTransformed
            version of the dataset (the pickles with the ``_yj`` suffix).

    Returns:
        tuple: ``(train_data, test_data, train_labels, test_labels)``.
        Note the order: both data frames come first, then both label arrays.
        Labels are NumPy arrays with ``0`` for benign and ``1`` for
        everything else.

    Raises:
        ValueError: If ``columns`` is given but empty.
        TypeError: If ``columns`` contains neither integers nor strings.
    """
    ## Data loading and prep
    # Both variants share the same base names; the transformed pickles only
    # differ by their suffix.
    suffix = '_yj' if transformed else ''

    # As we've pickled the encoded dataset,
    # we only need to load these pickles to get the Pandas DataFrames back.
    cic_train_data = dataset_tools.load_df('cic_train_data_rand' + suffix, CIC_FOLDER_PATH)
    cic_train_labels = dataset_tools.load_df('cic_train_labels_rand' + suffix, CIC_FOLDER_PATH)
    cic_test_data = dataset_tools.load_df('cic_test_data_rand' + suffix, CIC_FOLDER_PATH)
    cic_test_labels = dataset_tools.load_df('cic_test_labels_rand' + suffix, CIC_FOLDER_PATH)

    if columns is not None:
        # Materialize once so that any sequence type can be checked for
        # emptiness and indexed by position.
        columns = list(columns)
        if not columns:
            raise ValueError("columns must contain at least one entry")

        first = columns[0]
        if isinstance(first, (int, np.integer)):
            # Positional selection; np.integer covers indices that come out
            # of NumPy arrays, which are not instances of the builtin int.
            cic_train_data = cic_train_data.iloc[:, columns]
            cic_test_data = cic_test_data.iloc[:, columns]
        elif isinstance(first, str):
            cic_train_data = cic_train_data.filter(columns)
            cic_test_data = cic_test_data.filter(columns)
        else:
            # Previously such input was ignored and the unfiltered dataset
            # was returned, hiding the caller's mistake.
            raise TypeError(
                "columns must contain ints or strings, got "
                + type(first).__name__
            )

    # ### Label translation
    # As we are doing binary classification,
    # we only need to know if the entry is normal/benign (*0*) or malicious (*1*)
    # Also, by default the encoding starts with BENIGN -> 1
    def translate_to_binary(label_values):
        # Whole-array operation: encoded value 1 (BENIGN) -> 0, the rest -> 1.
        # Unlike np.vectorize this also works for empty label arrays.
        return np.where(np.asarray(label_values) == 1, 0, 1)

    cic_train_labels = translate_to_binary(cic_train_labels['label_encoded'].values)
    cic_test_labels = translate_to_binary(cic_test_labels['label_encoded'].values)
    print("")
    print("No of train entries:\t", len(cic_train_data))
    print("No of train labels:\t", len(cic_train_labels))
    print("-----------")
    print("No of test entries:\t", len(cic_test_data))
    print("No of test labels:\t", len(cic_test_labels))

    return (cic_train_data, cic_test_data, cic_train_labels, cic_test_labels)