# 000_women_or_children_first.py [download]


#!/usr/bin/env python3

import sys
import pandas as pd
import sklearn.metrics

def get_data(filename):
    """
    Load the passenger table from a CSV file into a pandas DataFrame.

    The "Cabin" column is forced to be read as a string, because
    pandas cannot infer that on its own.

    No column is designated as the row index, and blank cells get
    pandas' default missing-value handling.  If the CSV stores the
    instance index in column 0, pass index_col=0 to read_csv; to keep
    blank cells as empty strings instead, pass keep_default_na=False.

    https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
    """
    column_types = {"Cabin": str}
    return pd.read_csv(filename, dtype=column_types)


def predict_women_survive(data):
    """
    Predict survival for exactly the rows whose "Sex" is "female".

    Returns a boolean Series aligned with data: True means
    "predicted to survive".
    """
    sex = data["Sex"]
    return sex == "female"

def predict_children_survive(data, age):
    """
    Predict survival for exactly the rows whose "Age" is strictly
    below the given age threshold.

    Returns a boolean Series aligned with data.  Rows with a missing
    Age compare as False, so they are predicted not to survive.
    """
    ages = data["Age"]
    return ages < age

def predict_women_children_survive(data, age):
    """
    Predict survival for rows that are female OR younger than age.

    Returns a boolean Series aligned with data.  A missing Age never
    satisfies the "younger than" test, so such a row is predicted to
    survive only if its "Sex" is "female".
    """
    is_female = data["Sex"] == "female"
    is_child = data["Age"] < age
    return is_female | is_child

def tree_predict_survive(data, age):
    r"""
    Predict survival with a small hand-built decision tree.

                     female?
                    /       \
                  yes        no
                   |          |
               Pclass?     Age < age?
              /   |   \     /       \
             1    2    3  yes        no
             |    |    |   |          |
       survive survive die survive   die

    Women are predicted to survive if they travel first or second
    class, but not third class.  Everyone else is predicted to survive
    if younger than age, but not if equal to or older than age.

    Returns a boolean Series aligned with data.  A missing Age compares
    as False, so a non-female row without an age is predicted to die.

    (The docstring is a raw string so that the backslashes in the
    diagram are kept literally instead of being read as escapes.)
    """
    is_female = data["Sex"] == "female"
    upper_class = data["Pclass"].isin([1, 2])
    is_young = data["Age"] < age

    women_branch = is_female & upper_class
    other_branch = ~is_female & is_young
    return women_branch | other_branch


def manual_metric(y, yhat):
    """
    Print a confusion table plus precision, recall and F1, all counted
    by hand (no scikit-learn).

    y    -- actual labels; a value equal to 1 counts as positive.
    yhat -- predictions; any truthy value counts as a positive prediction.

    The two sequences are paired by position, so they may be lists,
    arrays, or pandas Series with any index.  They must have the same
    length, otherwise ValueError is raised.

    Table layout (rows are the predicted class, columns the actual class):

        +-----+-----+
        |  TN |  FN |
        +-----+-----+
        |  FP |  TP |
        +-----+-----+

    A ratio whose denominator is zero (no predicted positives, no
    actual positives) is reported as 0.0 instead of raising
    ZeroDivisionError; F1 is likewise 0.0 when there are no true
    positives.

    Returns None; all output goes to stdout.
    """
    if len(y) != len(yhat):
        raise ValueError(
            "y and yhat must have the same length ({} != {})".format(len(y), len(yhat))
        )

    tp = fp = tn = fn = 0

    # zip() pairs the values positionally, which avoids mixing
    # label-based and position-based lookups on pandas objects.
    for actual, predicted in zip(y, yhat):
        is_positive = actual == 1
        if predicted:
            if is_positive:
                tp += 1
            else:
                fp += 1
        elif is_positive:
            fn += 1
        else:
            tn += 1

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    # Harmonic mean of precision and recall; only defined when both are
    # non-zero, which is the case exactly when tp > 0.
    if tp:
        f1 = 2.0 / ((1.0 / precision) + (1.0 / recall))
    else:
        f1 = 0.0

    table = "+-----+-----+\n|{:4d} |{:4d} |\n+-----+-----+\n|{:4d} |{:4d} |\n+-----+-----+\n".format(tn, fn, fp, tp)
    print(table)
    print()
    print("precision: {}".format(precision))
    print("recall: {}".format(recall))
    print("f1: {}".format(f1))
    return

def sklearn_metric(y, yhat):
    """
    Print a confusion table plus precision, recall and F1, all
    computed by scikit-learn.

    y    -- actual labels.
    yhat -- predicted labels.

    scikit-learn's confusion matrix is indexed [actual][predicted];
    the table is printed transposed, so its first row holds the
    (actual 0, actual 1) counts for predicted class 0 and its second
    row holds the same for predicted class 1.

    Returns None; all output goes to stdout.
    """
    cm = sklearn.metrics.confusion_matrix(y, yhat)
    true_neg = cm[0][0]
    false_neg = cm[1][0]
    false_pos = cm[0][1]
    true_pos = cm[1][1]

    layout = "+-----+-----+\n|{:4d} |{:4d} |\n+-----+-----+\n|{:4d} |{:4d} |\n+-----+-----+\n"
    print(layout.format(true_neg, false_neg, false_pos, true_pos))
    print()

    # Compute all three scores before printing any of them.
    scores = (
        sklearn.metrics.precision_score(y, yhat),
        sklearn.metrics.recall_score(y, yhat),
        sklearn.metrics.f1_score(y, yhat),
    )
    precision, recall, f1 = scores
    print("precision: {}".format(precision))
    print("recall: {}".format(recall))
    print("f1: {}".format(f1))
    return


def main(argv):
    """
    Score each hand-written survival rule against the training labels.

    argv -- command-line argument list in sys.argv form.  If it has a
            second element, that is used as the path of the CSV file;
            otherwise "titanic-train.csv" in the current directory is
            read.

    For every rule, a header naming the rule is printed, followed by
    the report from sklearn_metric comparing the rule's predictions
    with the "Survived" column.

    Returns None.
    """
    default_filename = "titanic-train.csv"
    if argv is not None and len(argv) > 1:
        filename = argv[1]
    else:
        filename = default_filename
    data = get_data(filename)

    # Age threshold shared by every rule that looks at age.
    child_age = 10

    predictions = {
        "female": predict_women_survive(data),
        "children": predict_children_survive(data, child_age),
        "female or children": predict_women_children_survive(data, child_age),
        "tree": tree_predict_survive(data, child_age),
    }

    for key, yhat in predictions.items():
        print()
        print("========================================")
        print()
        print("Prediction Criteria: {}".format(key))
        # manual_metric(data["Survived"], yhat) is the hand-counted
        # version of this report and can be swapped in as a cross-check.
        sklearn_metric(data["Survived"], yhat)
        print()

    return

if __name__ == "__main__":
    main(sys.argv)
# Last Updated 02/01/2024