DEPARTMENT OF COMPUTING

preprocess.py [download]


#!/usr/bin/env python3

from pipeline_elements import *
import sklearn.impute
import sklearn.preprocessing
import sklearn.pipeline
import pandas as pd
import numpy as np
import joblib
import os

def make_numerical_feature_pipeline():
    """Build the pipeline for numerical predictors.

    Selects the numerical predictor columns, fills missing values with
    the column median, and standardizes each feature to zero mean and
    unit variance.
    """
    return sklearn.pipeline.Pipeline([
        ("numerical-features-only", DataFrameSelector(do_predictors=True, do_numerical=True)),
        ("missing-data", sklearn.impute.SimpleImputer(strategy="median")),
        ("scaler", sklearn.preprocessing.StandardScaler()),
    ])


def make_categorical_feature_pipeline():
    """Build the pipeline for categorical predictors.

    Selects the categorical predictor columns, replaces missing values
    with the constant string "NULL", then one-hot encodes; unseen
    categories at transform time are ignored (all-zero row) rather than
    raising.
    """
    return sklearn.pipeline.Pipeline([
        ("categorical-features-only", DataFrameSelector(do_predictors=True, do_numerical=False)),
        ("missing-data", sklearn.impute.SimpleImputer(strategy="constant", fill_value="NULL")),
        ("encode-category-bits", sklearn.preprocessing.OneHotEncoder(categories='auto', handle_unknown='ignore')),
    ])

def make_feature_pipeline():
    """Combine the numerical and categorical pipelines into one transformer.

    The FeatureUnion runs both branches on the same input dataframe and
    concatenates their outputs column-wise.
    """
    branches = [
        ("numerical", make_numerical_feature_pipeline()),
        ("categorical", make_categorical_feature_pipeline()),
    ]
    return sklearn.pipeline.FeatureUnion(transformer_list=branches)

def preprocess_dataframe(pipeline, dataframe, label):
    """
    Preprocess a dataframe with the given pipeline.
    Assumes the pipeline has been fit.
    Assumes dataframe has an index column, and preserves it.
    If dataframe has a series identified by label, it is preserved.
    Assumes all other columns are features, and transforms them.
    """

    # every column except the label (if present) is a feature to transform
    have_label = label in dataframe.columns
    feature_names = [name for name in dataframe.columns if name != label]

    # transform features only; the label column passes through untouched
    X = dataframe[feature_names]
    X_transformed = pipeline.transform(X)

    # If the transform produced a sparse matrix (usually from one-hot
    # encoding), densify it. Use toarray() rather than todense():
    # todense() returns the deprecated numpy.matrix type, which pandas
    # does not handle reliably; toarray() yields a plain ndarray.
    if hasattr(X_transformed, "toarray"):
        X_transformed = X_transformed.toarray()

    # reconstruct a dataframe; the original feature names are lost
    # (the pipeline may have changed the number/meaning of columns),
    # but the original row index is preserved.
    df1 = pd.DataFrame(X_transformed, index=dataframe.index)

    if have_label:
        # re-attach the label values, aligned by position
        df1[label] = dataframe[label].to_numpy()

    return df1

def fit_pipeline_to_dataframe(dataframe):
    """Create a fresh feature pipeline and fit it to the given dataframe."""
    fitted = make_feature_pipeline()
    fitted.fit(dataframe)
    return fitted

def save_pipeline(pipeline, filename):
    """Persist a (fitted) pipeline to disk with joblib."""
    joblib.dump(pipeline, filename)

def load_pipeline(filename):
    """Load a previously saved pipeline from disk with joblib."""
    return joblib.load(filename)

def preprocess_file(input_filename, output_filename, pipeline_filename, label):
    """Read a CSV, preprocess its features, and write the result as CSV.

    The first column of the input is treated as the row index. If the
    pipeline file already exists, the saved (previously fit) pipeline is
    reused; otherwise a new pipeline is fit to this data and saved, so
    later calls (e.g. on the test split) transform with the same fit.
    """
    dataframe = pd.read_csv(input_filename, index_col=0)

    if os.path.exists(pipeline_filename):
        pipeline = load_pipeline(pipeline_filename)
    else:
        pipeline = fit_pipeline_to_dataframe(dataframe)
        save_pipeline(pipeline, pipeline_filename)

    processed = preprocess_dataframe(pipeline, dataframe, label)
    processed.to_csv(output_filename, index=True)

def main_train():
    """Preprocess the training split, fitting (and saving) the pipeline
    if no saved pipeline exists yet."""
    preprocess_file(
        "loan-train.csv",
        "loan-preprocessed-train.csv",
        "loan-preprocessor.joblib",
        "loan_status",
    )

def main_test():
    """Preprocess the test split, reusing the pipeline saved during
    training (or fitting one from the test data if none exists)."""
    preprocess_file(
        "loan-test.csv",
        "loan-preprocessed-test.csv",
        "loan-preprocessor.joblib",
        "loan_status",
    )

def main():
    """Script entry point: currently processes the test split only.

    Run main_train() first (uncomment below) so the pipeline is fit on
    the training data before transforming the test data.
    """
    # main_train()
    main_test()


if __name__ == "__main__":
    main()

Last Updated 02/20/2025