Feature Engineering

Shows how to use the aiking library on a Kaggle dataset (the Titanic competition).

Import public packages

import fastcore
import pandas as pd
import pathlib
from fastcore.all import *
from fastcore.imports import *
import os
import sys
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split, cross_validate
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix
from IPython.display import display
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Import private packages

# Treat the session as a Kaggle kernel when the kaggle_secrets module is already loaded in this interpreter
is_kaggle = 'kaggle_secrets' in sys.modules
if is_kaggle:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    # Copy the Kaggle API credentials from the notebook secrets into environment variables
    os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("kaggle_username")
    # NOTE(review): this check runs after the assignment above; if get_secret can return None the
    # assignment itself would fail first — confirm against the kaggle_secrets behaviour
    if not os.environ['KAGGLE_USERNAME']: raise Exception("Please insert your Kaggle username and key into Kaggle secrets")
    os.environ['KAGGLE_KEY'] = user_secrets.get_secret("kaggle_key")
    github_pat = user_secrets.get_secret("GITHUB_PAT")
    # IPython shell escape: install aiking from GitHub, with the personal access token embedded in the URL
    # NOTE(review): confirm the token cannot end up in saved notebook output or logs
    !pip install -Uqq git+https://{github_pat}@github.com/Rahuketu86/aiking
else:
    from aiking.data.external import *
    # Outside Kaggle, untar_data (presumably provided by the star import above) returns the titanic data folder
    path = untar_data("kaggle_competitions::titanic"); 
    print(path.ls())
[Path('/AIKING_HOME/data/titanic/gender_submission.csv'), Path('/AIKING_HOME/data/titanic/test.csv'), Path('/AIKING_HOME/data/titanic/train.csv')]
from aiking.ml.structured import *

Read the Dataset

# Dataset root: the DATA_DIR environment variable when set, otherwise "/kaggle/input"
data_dir = pathlib.Path(os.getenv('DATA_DIR', "/kaggle/input")); 
# Rebinds `path` (the non-Kaggle branch of the setup cell also assigns it) to the titanic folder under data_dir
path = data_dir/"titanic"
# Last expression of the cell: lists the folder contents as the cell output
path.ls()
(#3) [Path('/kaggle/input/titanic/gender_submission.csv'),Path('/kaggle/input/titanic/test.csv'),Path('/kaggle/input/titanic/train.csv')]
df_train = pd.read_csv(path/"train.csv"); df_train.head()
df_test = pd.read_csv(path/"test.csv"); df_test.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

EDA

display(df_train.describe(include='number').T, df_test.describe(include='number').T)
count mean std min 25% 50% 75% max
PassengerId 891.0 446.000000 257.353842 1.00 223.5000 446.0000 668.5 891.0000
Survived 891.0 0.383838 0.486592 0.00 0.0000 0.0000 1.0 1.0000
Pclass 891.0 2.308642 0.836071 1.00 2.0000 3.0000 3.0 3.0000
Age 714.0 29.699118 14.526497 0.42 20.1250 28.0000 38.0 80.0000
SibSp 891.0 0.523008 1.102743 0.00 0.0000 0.0000 1.0 8.0000
Parch 891.0 0.381594 0.806057 0.00 0.0000 0.0000 0.0 6.0000
Fare 891.0 32.204208 49.693429 0.00 7.9104 14.4542 31.0 512.3292
count mean std min 25% 50% 75% max
PassengerId 418.0 1100.500000 120.810458 892.00 996.2500 1100.5000 1204.75 1309.0000
Pclass 418.0 2.265550 0.841838 1.00 1.0000 3.0000 3.00 3.0000
Age 332.0 30.272590 14.181209 0.17 21.0000 27.0000 39.00 76.0000
SibSp 418.0 0.447368 0.896760 0.00 0.0000 0.0000 1.00 8.0000
Parch 418.0 0.392344 0.981429 0.00 0.0000 0.0000 0.00 9.0000
Fare 417.0 35.627188 55.907576 0.00 7.8958 14.4542 31.50 512.3292
display(df_train.describe(include='object').T, df_test.describe(include='object').T)
count unique top freq
Name 891 891 Braund, Mr. Owen Harris 1
Sex 891 2 male 577
Ticket 891 681 347082 7
Cabin 204 147 B96 B98 4
Embarked 889 3 S 644
count unique top freq
Name 418 418 Kelly, Mr. James 1
Sex 418 2 male 266
Ticket 418 363 PC 17608 5
Cabin 91 76 B57 B59 B63 B66 3
Embarked 418 3 S 270
display(df_train['Cabin'].str[0], df_train['Cabin'].str[1:])
0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: Cabin, Length: 891, dtype: object
0      NaN
1       85
2      NaN
3      123
4      NaN
      ... 
886    NaN
887     42
888    NaN
889    148
890    NaN
Name: Cabin, Length: 891, dtype: object
# [i.rsplit(" ", -1) for i in df_train['Ticket'].tolist()]
def get_ticket_features(row):
    """Turn the pieces of a ticket string into ``[prefix, number]``.

    Parameters
    ----------
    row : list of str
        One or two strings, as produced by splitting a ticket once on its
        last space (``"A/5 21171"`` -> ``["A/5", "21171"]``).

    Returns
    -------
    list
        ``[prefix, number]``. ``number`` is an ``int``; whichever part the
        ticket lacks is ``pd.NA``. A two-part ticket whose last part is not
        a number is returned as ``["<part0> <part1>", pd.NA]``.
    """
    if len(row) == 2:
        prefix, number = row
        try:
            return [prefix, int(number)]
        except ValueError:
            # The trailing token is not numeric: keep the full ticket text
            # (re-joined with the space it was split on) as the prefix
            # instead of aborting the whole ``apply``.
            return [" ".join(row), pd.NA]
    # Single token: it is either a bare ticket number or a bare prefix.
    if row[0].isdigit():
        return [pd.NA, int(row[0])]
    return [row[0], pd.NA]
# Split every ticket once on its last space and parse the pieces. `n` is passed by keyword
# because pandas 2.0 made it keyword-only for Series.str.rsplit; older versions accept it too.
s = df_train['Ticket'].str.rsplit(" ", n=1).apply(get_ticket_features) #.apply(lambda row: len(row)).hist()
df = pd.DataFrame(s.tolist(), columns=['prefix_ticket', 'num_ticket'])
# df.info() prints its report and returns None, so display() shows the nunique table followed by "None"
display(df.nunique(), df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   prefix_ticket  230 non-null    object
 1   num_ticket     887 non-null    object
dtypes: object(2)
memory usage: 14.0+ KB
prefix_ticket     44
num_ticket       678
dtype: int64
None
df_train['Name'][1]
'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'

Modelling

Define Pipeline

def get_model_pipeline(max_n_cat=0, 
                       cat_dict=None, 
                       scale_dict={'class': StandardScaler},
                       cat_num_dict={'class':NumericalEncoder,'categories':None},
                       cat_dummy_dict={'class':OneHotEncoder,'handle_unknown':'ignore'},
                       imputer_dict={'class':SimpleImputer, 'strategy':'median'},
                      ):
    """Build an unfitted pipeline: an aiking ``Proc`` step followed by a random forest.

    Every argument is forwarded, under the key of the same name, to the
    ``get_default_feature_def`` layer spec that configures ``Proc``.

    Parameters
    ----------
    max_n_cat : int
        Forwarded as ``'max_n_cat'`` (presumably a category-count threshold;
        confirm in aiking).
    cat_dict : optional
        Forwarded as ``'cat_dict'``.
    scale_dict, cat_num_dict, cat_dummy_dict, imputer_dict : dict
        Specs holding a ``'class'`` entry plus extra settings for that class.
        Dict arguments are shallow-copied before being handed on.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``make_pipeline(Proc(...), RandomForestClassifier(n_jobs=-1))``, not fitted.
    """
    def _own_copy(spec):
        # The dict defaults in the signature are created once and shared by
        # every call; passing a copy keeps downstream code from altering them
        # (or a caller's dict). Non-dict values are passed through untouched.
        return dict(spec) if isinstance(spec, dict) else spec

    feature_kwargs = {
        'skip_flds': None,
        'ignored_flds': None,
        'max_n_cat': max_n_cat,
        'na_exclude_cols': [],
        'scale_var_num': True,
        'scale_var_cat': False,
        'scale_dict': _own_copy(scale_dict),
        'cat_num_dict': _own_copy(cat_num_dict),
        'cat_dummy_dict': _own_copy(cat_dummy_dict),
        'imputer_dict': _own_copy(imputer_dict),
        'include_time_cols': True,
        'keep_dt_cols': False,
        'cat_dict': cat_dict,
    }
    # A layer spec is a (feature-definition function, keyword arguments) pair.
    layer_spec_default = (get_default_feature_def, feature_kwargs)

    proc = Proc(layer_specs=[layer_spec_default])
    model = RandomForestClassifier(n_jobs=-1)
    return make_pipeline(proc, model)
pipeline = get_model_pipeline(cat_dict=None); pipeline
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x7fd5fa242610>),
                ('randomforestclassifier', RandomForestClassifier(n_jobs=-1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Train on Partial Data

max_n_cat = 5

def get_xy(df, col='Survived'):
    """Split ``df`` into a feature frame (all columns except ``col``) and the ``col`` target series."""
    target = df[col]
    features = df.drop(columns=[col])
    return features, target
    
# Features are every column of df_train except the target; the target is the 'Survived' column
X, y = get_xy(df_train)

# Build a new pipeline with max_n_cat and fit it on all rows of df_train (no hold-out split is made here)
pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
pipeline.fit(X, y)
Pipeline(steps=[('proc', <aiking.ml.structured.Proc object at 0x7fd68e075430>),
                ('randomforestclassifier', RandomForestClassifier(n_jobs=-1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
get_scorer_dict(scorer_names=['accuracy', 'precision', 'recall', 'roc_auc'])
{'accuracy': make_scorer(accuracy_score),
 'precision': make_scorer(precision_score, average=binary),
 'recall': make_scorer(recall_score, average=binary),
 'roc_auc': make_scorer(roc_auc_score, needs_threshold=True)}

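Note: the original remark here, "an estimate of MSLE around 0.26 to 0.30 (really 0.304 from the validation estimate)", appears to be carried over from a regression notebook. MSLE is not computed anywhere in this classification example; the cross-validated accuracy, precision, recall and ROC AUC below are the relevant estimates.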

Cross validation estimate

# Start from a new, unfitted pipeline for the cross-validation run
pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
# scores = cross_val_score(pipeline, X, y, scoring='accuracy'); scores
# Collect the cross_validate results (timings plus one test_<metric> column per requested scorer) in a DataFrame
scores_df = pd.DataFrame(cross_validate(pipeline, X, y, scoring=['accuracy', 'precision', 'recall', 'roc_auc'])); scores_df
fit_time score_time test_accuracy test_precision test_recall test_roc_auc
0 0.192457 0.113971 0.804469 0.723684 0.797101 0.850461
1 0.135919 0.103486 0.769663 0.675325 0.764706 0.818249
2 0.138929 0.095469 0.820225 0.725000 0.852941 0.902807
3 0.140578 0.116264 0.808989 0.765625 0.720588 0.869786
4 0.138751 0.102551 0.859551 0.768293 0.913043 0.904933
scores_df.plot()
<AxesSubplot:>

# pipeline = get_model_pipeline(cat_dict); pipeline
# Cross-validated predictions for every row of X, using the pipeline created in the previous cell
y_pred = cross_val_predict(pipeline, X, y)
# Earlier two-step way of drawing the confusion matrix, kept for reference:
# cm = confusion_matrix(y, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot()
# Plot the confusion matrix directly from true labels and the cross-validated predictions
ConfusionMatrixDisplay.from_predictions(y, y_pred)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>

# Per-split accuracies from the cross-validation table
scores = scores_df['test_accuracy']
# Report mean accuracy with a band of three standard deviations on either side
print(f"Expected Scores {scores.mean() - 3*scores.std():.2%} to {scores.mean() + 3*scores.std():.2%} with mean as {scores.mean():.2%}")
Expected Scores 71.55% to 90.96% with mean as 81.26%

Predictions

Retrain pipeline on complete dataset

# Final model: a new pipeline fitted on all of X, y
pipeline = get_model_pipeline(max_n_cat,cat_dict=None)
pipeline.fit(X, y)
# Predictions are made on the same rows the pipeline was just fitted on, so the
# confusion matrix and accuracy below are training-set figures, not a validation estimate
y_pred = pipeline.predict(X)
cm = confusion_matrix(y, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
display(disp.plot(), accuracy_score(y, y_pred))
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay>
0.9988776655443322

Calculation for test set and submission

df_sample_submission = pd.read_csv(path/"gender_submission.csv"); df_sample_submission.head()
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
os.getcwd()
'/kaggle/working/nbs/book/competitions/titanic'
# Predict the test set; the single column is named after the target series (y.squeeze().name)
predictions = pd.DataFrame(pipeline.predict(df_test), columns=[y.squeeze().name]); predictions
# Put PassengerId next to the predictions; concat with axis=1 aligns the two on their row index
df_submission = pd.concat([df_test['PassengerId'], predictions], axis=1); df_submission
# Write the submission file into the current working directory, without the index column
df_submission.to_csv('submission.csv', index=False)
# Outside Kaggle, upload the file to the titanic competition through the Kaggle API client
if not is_kaggle:
    import kaggle
    kaggle.api.competition_submit_cli("submission.csv", "Submission from local machine", competition="titanic")
    # from aiking.integrations.kaggle import push2kaggle
    # push2kaggle("00_index.ipynb")
100%|██████████| 2.77k/2.77k [00:04<00:00, 627B/s]