"""Miscellaneous dataset tools and helper functions"""
import os
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
def prin_com_analysis(train_data, test_data, variance=0.95):
    """Project both data sets onto principal components fitted on the train data.

    Args:
        train_data (Pandas.DataFrame or Numpy.ndarray): Training data; the PCA
            is fitted on this set only.
        test_data (Pandas.DataFrame or Numpy.ndarray): Test data to project
            with the PCA fitted on the training data.
        variance (float): Forwarded to ``PCA`` as ``n_components``.
            Defaults to 0.95.

    Returns:
        train_data, test_data: The transformed data sets
    """
    print("Beginning PCA")
    # Fit on the training set only, then reuse the same projection for the
    # test set so both end up in the same component space.
    pca = PCA(n_components=variance)
    pca.fit(train_data)
    return pca.transform(train_data), pca.transform(test_data)
# Normalization and Scaling
def standard_scale(train_data, test_data):
    """Scale given data with a StandardScaler trained on the train data.

    Args:
        train_data (Pandas.DataFrame or Numpy.ndarray): Training data to scale
        test_data (Pandas.DataFrame or Numpy.ndarray): Test data to scale

    Returns:
        train_data, test_data (Numpy.ndarray): The transformed data sets
    """
    # The scaler is fitted on the training data only; the test data is
    # transformed with the statistics learned from the training data.
    scaler = preprocessing.StandardScaler()
    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)
def min_max_scale(train_data, test_data):
    """Scale given data with a MinMaxScaler trained on the train data.

    Args:
        train_data (Pandas.DataFrame or Numpy.ndarray): Training data to scale
        test_data (Pandas.DataFrame or Numpy.ndarray): Test data to scale

    Returns:
        train_data, test_data (Numpy.ndarray): The transformed data sets
    """
    # The scaler is fitted on the training data only; the test data is
    # transformed with the parameters learned from the training data.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)
def abs_scaler(train_data, test_data):
    """Scale given data with a MaxAbsScaler trained on the train data.

    Args:
        train_data (Pandas.DataFrame or Numpy.ndarray): Training data to scale
        test_data (Pandas.DataFrame or Numpy.ndarray): Test data to scale

    Returns:
        train_data, test_data (Numpy.ndarray): The transformed data sets
    """
    # The scaler is fitted on the training data only; the test data is
    # transformed with the parameters learned from the training data.
    scaler = preprocessing.MaxAbsScaler()
    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)
# Load pickled pandas DataFrames from disk
def load_df(filename, folderpath):
    """Helper function to load Dataframes from a given folder.

    Args:
        filename (str): Name of the pickle file without the ``.pkl`` extension
        folderpath (str): Folder that contains the pickle file

    Returns:
        The object stored in ``<folderpath>/<filename>.pkl``, as returned by
        ``pandas.read_pickle``.

    Note:
        Unpickling can execute arbitrary code, so only load files that come
        from a trusted source.
    """
    # The extension is appended here, so callers pass the bare file name.
    filepath = os.path.join(folderpath, filename + '.pkl')
    return pd.read_pickle(filepath)