Source code for MLT.datasets.pickleCIC

# coding: utf-8

# # Data Loading and Joining

import os
import sys
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer


CIC_FOLDER_PATH = os.path.join(os.path.dirname(__file__), 'CICIDS2017pub')


def prepare_dataset():
    """Base function for dataset loading and preparation.

    Returns
    -------
    data : tuple
        A tuple consisting of (cic_data (pandas.DataFrame),
        cic_labels (pandas.DataFrame))
    """
    datafile_names_sorted = [
        'Monday-WorkingHours.pcap_ISCX.csv',
        'Tuesday-WorkingHours.pcap_ISCX.csv',
        'Wednesday-workingHours.pcap_ISCX.csv',
        'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
        'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
        'Friday-WorkingHours-Morning.pcap_ISCX.csv',
        'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
        'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    ]

    # Read the per-day CSVs and concatenate them once at the end
    # (DataFrame.append() was removed in pandas 2.0).
    day_frames = []
    for filename in datafile_names_sorted:
        input_file_name = os.path.join(CIC_FOLDER_PATH, filename)
        print('Appending', input_file_name)
        try:
            day_frames.append(pd.read_csv(
                input_file_name,
                na_values=["Infinity", "NaN"],
                skipinitialspace=True,
            ))
        except FileNotFoundError:
            sys.exit("WARNING: Could not find {}! Check your path!".format(input_file_name))
    cic_data = pd.concat(day_frames, ignore_index=True, sort=False)

    # ### NaN handling
    print("Handling NaN's")
    # First of all, look for undefined / NaN fields. Dropping the whole entry
    # because of these might seem counterintuitive from a science/research
    # perspective, but it is a valid approach in the domain of network
    # equipment (drop the packet if something is wrong).
    cic_data.dropna(inplace=True)
    # **Important**: As multiple rows have been dropped, the DataFrame needs a
    # new, gap-free index.
    cic_data.reset_index(drop=True, inplace=True)

    # ## At first glance
    print("Dropping always-0 fields")
    # A look at the value distribution reveals that the dataset contains
    # multiple fields that only carry zeroes. We might as well drop these, as
    # they only pollute the dataset.
    zero_only = ['Bwd PSH Flags', 'Bwd URG Flags', 'Fwd Avg Bytes/Bulk',
                 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate',
                 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk',
                 'Bwd Avg Bulk Rate']
    cic_data.drop(zero_only, axis=1, inplace=True)

    # ## Fixing the labels
    print("Fixing labels")
    cic_data['Label'] = cic_data['Label'].str.replace('.*XSS', 'WebAttackXSS', regex=True)
    cic_data['Label'] = cic_data['Label'].str.replace('.*Brute Force', 'WebAttackBrute Force', regex=True)
    cic_data['Label'] = cic_data['Label'].str.replace('.*Sql Injection', 'WebAttackSql Injection', regex=True)
    cic_data['Label'] = cic_data['Label'].str.replace(' ', '', regex=True)
    labels = cic_data[['Label']]
    cic_data.drop(['Label'], axis=1, inplace=True)

    # ## Label tokenization
    print("Tokenizing labels")
    # As the algorithms cannot work with text labels directly, tokenize them.
    number_of_classes = len(labels['Label'].unique())
    # Tokenize the LABELS.
    label_tokenizer = Tokenizer(num_words=number_of_classes + 1, filters='')
    label_tokenizer.fit_on_texts(labels['Label'].unique())
    # Run the fitted tokenizer on the label column and save the encoded data
    # as a DataFrame.
    enc_labels = pd.DataFrame.from_records(
        label_tokenizer.texts_to_sequences(labels['Label']),
        columns=['label_encoded'])
    # To be able to translate the encoded labels back later, write the
    # tokenizer's word index to a file next to the CSVs.
    filename = os.path.join(CIC_FOLDER_PATH, 'cic_label_wordindex.json')
    print('Writing encoder data to file {}:\n\t{}'.format(filename, label_tokenizer.word_index))
    with open(filename, 'w') as outfile:
        json.dump(label_tokenizer.word_index, outfile)

    # Make sure that shape and index of both data structures add up, then join
    # them together.
    assert enc_labels.shape[0] == labels.shape[0]
    assert list(enc_labels.index.values) == list(labels.index.values)
    labels = pd.concat([labels, enc_labels], axis=1, sort=False)

    # ## Converting Destination Port Information
    print("Converting and encoding Dest Port info")
    # The idea is to group destination ports into three categories:
    # - 0 - 1023: system / well-known ports
    # - 1024 - 49151: user / registered ports
    # - > 49151: dynamic / private ports
    #
    # Well-known ports stay untouched, while registered and dynamic ports are
    # collapsed into a single marker value per category. Series.where() keeps
    # values where the condition holds and replaces the rest.
    ports = cic_data['Destination Port']
    # Every port above 49151 becomes -1, reflecting dynamic ports ...
    ports = ports.where(ports <= 49151, -1)
    # ... and every remaining port above 1023 becomes -2, reflecting
    # registered ports (the -1 markers stay untouched, as -1 <= 1023).
    cic_data['Destination Port'] = ports.where(ports <= 1023, -2)

    # ## OHE Destination Ports
    # The ports are a categorical feature that should be usable by the ML
    # algorithms, so they need to be one-hot encoded. To lower the class
    # count, the two meta-categories for registered and dynamic ports were
    # established above.
    def one_hot_encode_drop(dframe, column, prefix):
        dums = pd.get_dummies(dframe[column], prefix=prefix)
        dframe.drop(column, axis=1, inplace=True)
        dframe = pd.concat([dframe, dums], axis=1, sort=False)
        return dframe

    cic_data = one_hot_encode_drop(cic_data, 'Destination Port', 'Destination Port')

    return (cic_data, labels)
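# The JSON written by prepare_dataset() maps each label string to its integer
# token. A minimal sketch of the reverse lookup could look like the helper
# below; decode_labels() is illustrative and not part of the original module
# (note that the Keras Tokenizer lowercases its input texts by default).
def decode_labels(encoded_tokens, wordindex_path=None):
    """Translate encoded label tokens back into label strings.

    Hypothetical convenience helper, assuming the word index JSON written by
    prepare_dataset().
    """
    if wordindex_path is None:
        wordindex_path = os.path.join(CIC_FOLDER_PATH, 'cic_label_wordindex.json')
    with open(wordindex_path) as infile:
        word_index = json.load(infile)
    # Invert the {label: token} mapping into {token: label}.
    index_word = {token: label for label, token in word_index.items()}
    return [index_word.get(token) for token in encoded_tokens]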
def pickleCIC_randomized():
    """Pulls a randomized test partition and pickles it to disk."""
    cic_data, cic_labels = prepare_dataset()

    # ## Train / Test Split
    train_data, test_data, train_labels, test_labels = train_test_split(
        cic_data, cic_labels, test_size=0.33, random_state=0)

    # Double-check the sizes to rule out careless mistakes: shape[0] is the
    # number of entries, which absolutely has to match between data and labels.
    assert test_labels.shape[0] == test_data.shape[0]
    assert train_labels.shape[0] == train_data.shape[0]
    assert list(train_data.index.values) == list(train_labels.index.values)
    assert list(test_data.index.values) == list(test_labels.index.values)

    print("\nNo of train flows:", len(train_data))
    print("No of train labels:", len(train_labels))
    print("Train Label classes: ", train_labels['Label'].unique())
    print("Encoded Train Label classes: ", train_labels['label_encoded'].unique())
    print("-------------------")
    print("No of test flows:", len(test_data))
    print("No of test labels:", len(test_labels))
    print("Test Label classes: ", test_labels['Label'].unique())
    print("Encoded Test Label classes: ", test_labels['label_encoded'].unique())

    # ## Feature Standardization
    # Standardization and scaling are done during k-fold cross-validation, so
    # they are deliberately skipped here.

    # ## Serialization
    print('Serializing to disk...')
    # At this point we have training and test sets with data and labels; the
    # label encoding has been written away as a JSON file. To make this data
    # reusable in future runs, serialize each DataFrame into its own binary
    # pickle, a feature directly supported by pandas
    # (https://pandas.pydata.org/pandas-docs/stable/api.html#id12).
    train_data.to_pickle(os.path.join(CIC_FOLDER_PATH, 'cic_train_data_rand.pkl'))
    train_labels.to_pickle(os.path.join(CIC_FOLDER_PATH, 'cic_train_labels_rand.pkl'))
    test_data.to_pickle(os.path.join(CIC_FOLDER_PATH, 'cic_test_data_rand.pkl'))
    test_labels.to_pickle(os.path.join(CIC_FOLDER_PATH, 'cic_test_labels_rand.pkl'))
    print('Finished randomized CICIDS2017 serialization')
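# A minimal, hypothetical driver for running this module directly; it is not
# part of the documented API. It regenerates the randomized pickles and then
# reloads two of them to confirm the round trip (pd.read_pickle is the
# counterpart of DataFrame.to_pickle).
if __name__ == '__main__':
    pickleCIC_randomized()
    train_data = pd.read_pickle(os.path.join(CIC_FOLDER_PATH, 'cic_train_data_rand.pkl'))
    train_labels = pd.read_pickle(os.path.join(CIC_FOLDER_PATH, 'cic_train_labels_rand.pkl'))
    print('Reloaded', len(train_data), 'train flows with',
          train_labels['label_encoded'].nunique(), 'encoded label classes')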