Source code for MLT.tools.helper_sklearn

"""Utility functions for scikit-learn-realted implementations"""
import os
from datetime import datetime
from sklearn.externals import joblib

from MLT.tools import prediction_entry as pe

[docs]def sklearn_train_model(model, training_data, training_labels, test_data, test_labels, model_savename):
    """Train the given model with data and predict the run"""
    starttime = datetime.now()
    model.fit(training_data, training_labels)
    finishtime = datetime.now()
    runtime = finishtime - starttime

    #predict the run
    test_predictions = model.predict(test_data)
    test_predictions_probabilities = model.predict_proba(test_data)[:, 1]
    # proba[:,1] returns just 1 of 2 columns. As they always add up, this is enough!

    sklearn_persist_model(model, model_savename)
    # append all this to a dataframe / JSON / whatever and
    return pe.PredictionEntry(test_labels, test_predictions, test_predictions_probabilities, runtime)


[docs]def sklearn_persist_model(model, model_savename):
    """Save a scikit model to disk"""
    joblib.dump(model, model_savename + '.pkl')

[docs]def sklearn_load_model(dirpath, modelname):
    """Load a scikit model from disk"""
    model_path = os.path.join(dirpath, modelname)
    return joblib.load(model_path)

[docs]def sklearn_load_modellist(model_filenames, model_path):
    """Load a list of scikit models from disk from given path"""
    loaded_models = []
    for model_fname in model_filenames:
        filename_wo_ext = os.path.splitext(model_fname)[0]
        loaded_models.append(
            (
                filename_wo_ext,
                sklearn_load_model(model_path, model_fname)
            )
        )
    return loaded_models


[docs]def predict_scikit(single_model, test_data, test_labels):
    """Only predict a model without fitting it"""
    starttime = datetime.now()

    test_predictions = single_model.predict(test_data)
    test_predictions_probabilities = single_model.predict_proba(test_data)[:, 1]

    finishtime = datetime.now()
    runtime = finishtime - starttime

    return pe.PredictionEntry(test_labels, test_predictions, test_predictions_probabilities, runtime)