# Import Packages

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.ensemble import RandomForestClassifier

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
 StandardScaler,
 MinMaxScaler,
 OneHotEncoder,
 OrdinalEncoder
)

from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.outliers.winsorizer import Winsorizer

import mlflow

from sklearn.metrics import (
 accuracy_score, 
 precision_score, 
 recall_score, 
 f1_score
)

from sklearn.metrics import ConfusionMatrixDisplay

# Load the Data

In [None]:
# read the csv file

df = pd.read_csv("data/titanic.csv")

df.head()

In [None]:
# count the missing values in every column of the raw data

df.isna().sum()

**Observations**:
1. The `Age`, `Embarked` and `Cabin` columns have missing values.

In [None]:
# info about the data

df.info()

# Data Cleaning

In [None]:
# columns removed by clean_data; the names are lower-case because
# clean_data lower-cases all column names before dropping

columns_to_drop = [
    'passengerid',
    'name',
    'ticket',
    'cabin',
]

In [None]:
def clean_data(df):
    """Return a cleaned copy of the raw passenger data.

    Steps, in order:
      1. lower-case every column name;
      2. drop the columns listed in the module-level ``columns_to_drop``;
      3. add a ``family`` column equal to ``sibsp + parch``;
      4. drop the now-redundant ``sibsp`` and ``parch`` columns.

    The input frame is not modified.
    """
    lowered = df.rename(columns=str.lower)
    trimmed = lowered.drop(columns=columns_to_drop)
    with_family = trimmed.assign(family=trimmed['sibsp'] + trimmed['parch'])
    return with_family.drop(columns=['sibsp', 'parch'])

In [None]:
final_df = clean_data(df)

final_df.head()

In [None]:
# shape of the cleaned data 

print(f'The cleaned data has {final_df.shape[0]} rows and {final_df.shape[1]} columns')

In [None]:
# count the missing values in every column of the cleaned data

final_df.isna().sum()

# EDA

In [None]:
# relative frequency of each value in the target column

final_df['survived'].value_counts(normalize=True)

In [None]:
# boxplots

def create_boxplot(data, column_name, hue=None):
    """Draw a seaborn boxplot of ``column_name`` on the y axis.

    ``data`` is the frame to plot from and ``hue`` is forwarded unchanged
    to ``sns.boxplot`` (``None`` by default). Nothing is returned.
    """
    plot_kwargs = {'data': data, 'y': column_name, 'hue': hue}
    sns.boxplot(**plot_kwargs)

In [None]:
# boxplot for age column
create_boxplot(final_df,'age')

In [None]:
# boxplot for fare column

create_boxplot(final_df,'fare')

**Overview**
- Outliers in the age and fare columns

In [None]:
# plot the distribution of categorical columns

def plot_distribution(data, column_name):
    """Draw a seaborn count plot with ``column_name`` on the x axis.

    ``data`` is the frame to plot from. Nothing is returned.
    """
    plot_kwargs = {'data': data, 'x': column_name}
    sns.countplot(**plot_kwargs)

In [None]:
# distribution for pclass
plot_distribution(final_df,'pclass')

In [None]:
# distribution for sex

plot_distribution(final_df,'sex')

In [None]:
# distribution for embarked 

plot_distribution(final_df,'embarked')

# Feature Engineering

In [None]:
final_df.head()

In [None]:
# separate the target column from the feature columns

y = final_df['survived']
X = final_df.drop(columns=['survived'])

In [None]:
X.head()

In [None]:
# hold out 20% of the rows for testing; the fixed random_state keeps
# the split reproducible between runs

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print('The shape of training data is', X_train.shape)
print('The shape of testing data is', X_test.shape)

## Pipelines for Individual Columns

In [None]:
# age pipeline: impute -> cap outliers -> standardise
age_pipe = Pipeline(
    steps=[
        # fill missing values with the median seen during fit
        ('impute', SimpleImputer(strategy='median')),
        # cap extreme values (gaussian method, fold=3)
        ('outliers', Winsorizer(capping_method='gaussian', fold=3)),
        # rescale with StandardScaler
        ('scale', StandardScaler()),
    ]
)


age_pipe

In [None]:
# fare pipeline: impute -> cap outliers -> standardise
#
# The imputer comes first so that a missing fare cannot reach the
# Winsorizer, which by default raises on NaN input. When the data has no
# missing fares the imputer leaves the values untouched, so training on
# complete data is unaffected; it only protects prediction on rows where
# the fare is absent. This mirrors the structure of the age pipeline.

fare_pipe = Pipeline(
    steps=[
        # fill missing values with the median seen during fit
        ('impute', SimpleImputer(strategy='median')),
        # cap extreme values (IQR method, fold=1.5)
        ('outliers', Winsorizer(capping_method='iqr', fold=1.5)),
        # rescale with StandardScaler
        ('scale', StandardScaler()),
    ]
)

fare_pipe

In [None]:
# embarked pipeline: impute -> count-encode -> scale

embarked_pipe = Pipeline(
    steps=[
        # fill missing values with the most frequent category
        ('impute', SimpleImputer(strategy='most_frequent')),
        # replace each category by its count
        ('count_encode', CountFrequencyEncoder(encoding_method='count')),
        # rescale the encoded counts with MinMaxScaler
        ('scale', MinMaxScaler()),
    ]
)

embarked_pipe

## Column Transformer

In [None]:
set_config(transform_output='pandas')

In [None]:
# build the column transformer: one transformer per listed column;
# columns that are not listed are passed through unchanged

preprocessor = ColumnTransformer(
    transformers=[
        ('age', age_pipe, ['age']),
        ('fare', fare_pipe, ['fare']),
        ('embarked', embarked_pipe, ['embarked']),
        (
            'sex',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['sex'],
        ),
        ('family', MinMaxScaler(), ['family']),
    ],
    remainder='passthrough',
    n_jobs=-1,
    force_int_remainder_cols=False,
)

preprocessor

In [None]:
# fit and transform the training data

preprocessor.fit_transform(X_train)

In [None]:
preprocessor.get_params()

# Model Pipeline

In [None]:
# hyper-parameters handed to RandomForestClassifier in the model pipeline

model_params = dict(
    bootstrap=True,
    ccp_alpha=0.0,
    class_weight=None,
    criterion='gini',
    max_depth=6,
    max_features='sqrt',
    max_leaf_nodes=None,
    max_samples=0.8,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    monotonic_cst=None,
    n_estimators=300,
    n_jobs=-1,
    oob_score=False,
    random_state=30,
    verbose=0,
    warm_start=False,
)

In [None]:
# full model: preprocessing followed by the random forest classifier
model_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('clf', RandomForestClassifier(**model_params)),
    ]
)

model_pipe

In [None]:
# fit the model on the training data

model_pipe.fit(X_train,y_train)

In [None]:
# evaluate the model on the test data

y_pred = model_pipe.predict(X_test)

# Convert every metric with float(): it accepts both NumPy scalars and
# plain Python floats, whereas `.item()` exists only on NumPy scalars and
# raises AttributeError when the metric function returns a Python float.
# The result is always a built-in float, which is what the metrics dict
# and MLflow logging downstream consume.
accuracy = float(accuracy_score(y_test, y_pred))
precision = float(precision_score(y_test, y_pred))
recall = float(recall_score(y_test, y_pred))
f1 = float(f1_score(y_test, y_pred))

In [None]:
# collect the test-set scores under the names used when logging

metrics = dict(
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1_score=f1,
)

metrics

In [None]:
# plot confusion matrix

cm = ConfusionMatrixDisplay.from_predictions(y_test,y_pred)

# MLFlow Tracking code

In [None]:
model_pipe.get_params()

In [None]:
X_test.join(y_test)

In [None]:
# point the MLflow client at the tracking server
# (a server must already be listening on 127.0.0.1:8080)

mlflow.set_tracking_uri("http://127.0.0.1:8080")

# choose the experiment that the run below is recorded under
mlflow.set_experiment("Mentos Zindagi")

# everything inside this block is logged to a single MLflow run
with mlflow.start_run() as run:
 # infer the model signature from the training inputs and the pipeline's
 # predictions on those same inputs
 data_signature = mlflow.models.infer_signature(model_input=X_train,model_output=model_pipe.predict(X_train))

 # log every parameter returned by the full pipeline's get_params()
 # (preprocessor and classifier alike, not only the preprocessor)
 mlflow.log_params(model_pipe.get_params())

 # log the test-set metrics collected in the `metrics` dict
 mlflow.log_metrics(metrics)
 
 # log the fitted pipeline, with its signature, under the artifact path "model.pkl"
 mlflow.sklearn.log_model(sk_model=model_pipe,artifact_path="model.pkl",signature=data_signature)

 # resolve the artifact URI of the logged model
 # NOTE(review): only the commented-out evaluation below uses model_uri
 model_uri = mlflow.get_artifact_uri("model.pkl")
 
 # # evaluate the model
 # evaluations = mlflow.models.evaluate(model=model_uri,
 # data=X_test.join(y_test),
 # targets='survived',
 # model_type="classifier")

 # log the confusion-matrix figure produced by ConfusionMatrixDisplay as a PNG artifact
 mlflow.log_figure(cm.figure_,artifact_file='confusion_matrix.png')