{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7dd3aed1-8c77-491a-beb4-6658b3e603b6",
   "metadata": {},
   "source": [
    "# Import Packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1b9541c-7de1-4c89-9424-01058657d4b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import set_config\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
    "\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.preprocessing import (\n",
    "    StandardScaler,\n",
    "    MinMaxScaler,\n",
    "    OneHotEncoder,\n",
    "    OrdinalEncoder\n",
    ")\n",
    "\n",
    "from feature_engine.encoding import CountFrequencyEncoder\n",
    "from feature_engine.outliers.winsorizer import Winsorizer\n",
    "\n",
    "import mlflow\n",
    "\n",
    "from sklearn.metrics import (\n",
    "    accuracy_score, \n",
    "    precision_score, \n",
    "    recall_score, \n",
    "    f1_score\n",
    ")\n",
    "\n",
    "from sklearn.metrics import ConfusionMatrixDisplay"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f44afcc-35a3-4e78-8b0f-1bff5cac2f42",
   "metadata": {},
   "source": [
    "# Load the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc883d66-7142-451c-b7a7-a88407311855",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the csv file\n",
    "\n",
    "df = pd.read_csv(\"data/titanic.csv\")\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74d95fa4-20c7-4e1a-a34a-438343bf1b89",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check for missing values in data\n",
    "\n",
    "(\n",
    "    df\n",
    "    .isna()\n",
    "    .sum()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4406de8-2796-471b-9b1d-37f324eb25fa",
   "metadata": {},
   "source": [
    "**Observations**:\n",
    "1. `Age`, `Embarked` and `Cabin` columns have missing values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c73034ac-df11-42dd-8238-c7ff9de91979",
   "metadata": {},
   "outputs": [],
   "source": [
    "# info about the data\n",
    "\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34bdfe67-8229-491e-b08f-2388aea5aab6",
   "metadata": {},
   "source": [
    "# Data Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f67329d-b6f3-4486-8ca0-bebfac68d258",
   "metadata": {},
   "outputs": [],
   "source": [
    "# columns to drop\n",
    "\n",
    "columns_to_drop = ['passengerid','name','ticket','cabin']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eae542f3-ee1c-4e5f-8600-85a29a7ec48a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_data(df):\n",
    "    \"\"\"Return a cleaned copy of the raw passenger frame.\n",
    "\n",
    "    Steps, in order: lower-case every column name, drop the columns listed\n",
    "    in the notebook-level `columns_to_drop`, add `family` as the sum of\n",
    "    `sibsp` and `parch`, then drop those two source columns.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Raw data; after lower-casing it is expected to contain every name\n",
    "        in `columns_to_drop` as well as `sibsp` and `parch`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        A new frame; every step returns a new object rather than editing `df`.\n",
    "    \"\"\"\n",
    "    lowered = df.rename(columns=str.lower)\n",
    "    trimmed = lowered.drop(columns=columns_to_drop)\n",
    "    family_size = trimmed['sibsp'] + trimmed['parch']\n",
    "    with_family = trimmed.assign(family=family_size)\n",
    "    return with_family.drop(columns=['sibsp', 'parch'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4465d425-1dd4-49be-9b1b-d7876fb42277",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df = clean_data(df)\n",
    "\n",
    "final_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37cef40c-628a-42a9-934a-ae3461d46853",
   "metadata": {},
   "outputs": [],
   "source": [
    "# shape of the cleaned data \n",
    "\n",
    "print(f'The cleaned data has {final_df.shape[0]} rows and {final_df.shape[1]} columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cebfd73f-5ede-4a17-be63-7355369997f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# missing values in the cleaned data\n",
    "\n",
    "(\n",
    "    final_df\n",
    "    .isna()\n",
    "    .sum()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "087aedb7-b716-4d10-8e03-d9a9149e3c57",
   "metadata": {},
   "source": [
    "# EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "075fc561-597a-48c8-9da4-718e1f0f21e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# distribution of target\n",
    "\n",
    "(\n",
    "    final_df\n",
    "    .loc[:,'survived']\n",
    "    .value_counts(normalize=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c414edaf-7749-4f0d-bc77-288f1846379e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# boxplots\n",
    "\n",
    "def create_boxplot(data, column_name, hue=None):\n",
    "    \"\"\"Draw a seaborn boxplot of `column_name` from `data` on the y-axis.\n",
    "\n",
    "    `hue` is forwarded unchanged to `sns.boxplot`; nothing is returned.\n",
    "    \"\"\"\n",
    "    plot_kwargs = {'data': data, 'y': column_name, 'hue': hue}\n",
    "    sns.boxplot(**plot_kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "053c8ad1-307a-4182-b798-aecd2e56e349",
   "metadata": {},
   "outputs": [],
   "source": [
    "# boxplot for age column\n",
    "create_boxplot(final_df,'age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4e6b0c1-beb6-4eb4-a1a3-e1ed297b7ac7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# boxplot for fare column\n",
    "\n",
    "create_boxplot(final_df,'fare')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2fc3dc52-6c52-4cef-b40d-f8b3f2553882",
   "metadata": {},
   "source": [
    "**Overview**\n",
    "- Outliers in the age and fare columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9eb075d8-c329-45ec-b311-c3ef16c55357",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot the distribution of categorical columns\n",
    "\n",
    "def plot_distribution(data, column_name):\n",
    "    \"\"\"Draw a seaborn count plot of `column_name` from `data` on the x-axis.\n",
    "\n",
    "    Nothing is returned; the plot is produced as a side effect.\n",
    "    \"\"\"\n",
    "    sns.countplot(x=column_name, data=data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8b1d684-37d7-445a-91cf-d017e5f1efa2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# distribution for pclass\n",
    "plot_distribution(final_df,'pclass')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ea410f0-8c0b-4281-acd8-9aecde4ee2d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# distribution for sex\n",
    "\n",
    "plot_distribution(final_df,'sex')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d758c8c4-5541-4dac-9696-b0e99dab3979",
   "metadata": {},
   "outputs": [],
   "source": [
    "# distribution for embarked \n",
    "\n",
    "plot_distribution(final_df,'embarked')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7fff975-6e32-43bb-8ec6-6be0a39f5c1e",
   "metadata": {},
   "source": [
    "# Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "110ea78a-d709-46bc-b6e7-dd813557bec8",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c374064-e47c-40f0-baf7-54e0ff842560",
   "metadata": {},
   "outputs": [],
   "source": [
    "# make X and y\n",
    "\n",
    "X = final_df.drop(columns=['survived'])\n",
    "y = final_df['survived']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51861761-7ee7-4613-9992-2ddfaef05b53",
   "metadata": {},
   "outputs": [],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "503e0bb6-af40-43d8-8614-8c56b5910ae3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.2,\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "print('The shape of training data is', X_train.shape)\n",
    "print('The shape of testing data is', X_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "970b2558-9fe4-4bf7-9d36-80775f1a640d",
   "metadata": {},
   "source": [
    "## Pipelines for Individual Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce21c311-c9b5-48fb-9619-1c386b95b065",
   "metadata": {},
   "outputs": [],
   "source": [
    "# age pipeline: impute, then cap outliers, then standardise\n",
    "age_pipe = Pipeline(\n",
    "    steps=[\n",
    "        # fill missing values with the median\n",
    "        ('impute', SimpleImputer(strategy='median')),\n",
    "        # cap outliers with the gaussian rule (fold=3); tail is left at the library default\n",
    "        ('outliers', Winsorizer(capping_method='gaussian', fold=3)),\n",
    "        # StandardScaler with its default settings\n",
    "        ('scale', StandardScaler()),\n",
    "    ]\n",
    ")\n",
    "\n",
    "age_pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9bc1761-c7d8-43ab-939e-ca1a84249af5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fare pipeline: cap outliers, then standardise\n",
    "\n",
    "fare_pipe = Pipeline(\n",
    "    steps=[\n",
    "        # cap outliers with the IQR rule (fold=1.5); tail is left at the library default\n",
    "        ('outliers', Winsorizer(capping_method='iqr', fold=1.5)),\n",
    "        # StandardScaler with its default settings\n",
    "        ('scale', StandardScaler()),\n",
    "    ]\n",
    ")\n",
    "\n",
    "fare_pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d588548f-ae54-43d3-8efe-16f34dd66954",
   "metadata": {},
   "outputs": [],
   "source": [
    "# embarked pipeline: impute, then count-encode, then rescale\n",
    "\n",
    "embarked_pipe = Pipeline(\n",
    "    steps=[\n",
    "        # fill missing values with the most frequent category\n",
    "        ('impute', SimpleImputer(strategy='most_frequent')),\n",
    "        # encode each category using the 'count' method\n",
    "        ('count_encode', CountFrequencyEncoder(encoding_method='count')),\n",
    "        # MinMaxScaler with its default settings\n",
    "        ('scale', MinMaxScaler()),\n",
    "    ]\n",
    ")\n",
    "\n",
    "embarked_pipe"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24838a6d-af02-44dc-abfc-addd714f7533",
   "metadata": {},
   "source": [
    "## Column Transformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1af74974-3b86-49ea-b495-663d20edd0a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "set_config(transform_output='pandas')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95f9b639-2194-4cdc-b565-9021eb933aaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# make the column transformer: one entry per feature that needs its own treatment\n",
    "\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('age', age_pipe, ['age']),\n",
    "        ('fare', fare_pipe, ['fare']),\n",
    "        ('embarked', embarked_pipe, ['embarked']),\n",
    "        (\n",
    "            'sex',\n",
    "            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),\n",
    "            ['sex'],\n",
    "        ),\n",
    "        ('family', MinMaxScaler(), ['family']),\n",
    "    ],\n",
    "    # columns not named above are passed through as they are\n",
    "    remainder='passthrough',\n",
    "    n_jobs=-1,\n",
    "    force_int_remainder_cols=False,\n",
    ")\n",
    "\n",
    "preprocessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa6aa741-afc3-449c-b75d-38a1bea32de6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit and transform the training data\n",
    "\n",
    "preprocessor.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ad34e5a-43e4-4e81-b2bb-b92e2c0b90ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor.get_params()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "898afc54-e717-4b3e-9142-c6235abdfe0a",
   "metadata": {},
   "source": [
    "# Model Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5c5d60d-3746-46c1-b15b-0bc59f62a187",
   "metadata": {},
   "outputs": [],
   "source": [
    "# hyperparameters passed to the random forest classifier\n",
    "\n",
    "model_params = dict(\n",
    "    bootstrap=True,\n",
    "    ccp_alpha=0.0,\n",
    "    class_weight=None,\n",
    "    criterion='gini',\n",
    "    max_depth=6,\n",
    "    max_features='sqrt',\n",
    "    max_leaf_nodes=None,\n",
    "    max_samples=0.8,\n",
    "    min_impurity_decrease=0.0,\n",
    "    min_samples_leaf=1,\n",
    "    min_samples_split=2,\n",
    "    min_weight_fraction_leaf=0.0,\n",
    "    monotonic_cst=None,\n",
    "    n_estimators=300,\n",
    "    n_jobs=-1,\n",
    "    oob_score=False,\n",
    "    random_state=30,\n",
    "    verbose=0,\n",
    "    warm_start=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b19559c5-53cb-4630-b64d-cbf2a1c9ca39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# chain the preprocessing and the random forest into a single estimator\n",
    "model_pipe = Pipeline(\n",
    "    steps=[\n",
    "        ('preprocessor', preprocessor),\n",
    "        ('clf', RandomForestClassifier(**model_params)),\n",
    "    ]\n",
    ")\n",
    "\n",
    "model_pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66876201-5959-45ca-9112-ef7d16bf66b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fit the model on the training data\n",
    "\n",
    "model_pipe.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eaf4ffb7-1763-4000-b9bc-3d2a8b776704",
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the model on the test data\n",
    "\n",
    "y_pred = model_pipe.predict(X_test)\n",
    "\n",
    "# float() accepts both NumPy scalars and built-in floats, whereas `.item()`\n",
    "# exists only on NumPy scalars and raises AttributeError on a plain float.\n",
    "# All four metrics are converted the same way so `metrics` holds one type.\n",
    "accuracy = float(accuracy_score(y_test, y_pred))\n",
    "precision = float(precision_score(y_test, y_pred))\n",
    "recall = float(recall_score(y_test, y_pred))\n",
    "f1 = float(f1_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b4d315f-690e-442e-b2f0-f1872e6ef579",
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect the test-set scores under the names they are logged with\n",
    "\n",
    "metrics = dict(\n",
    "    accuracy=accuracy,\n",
    "    precision=precision,\n",
    "    recall=recall,\n",
    "    f1_score=f1,\n",
    ")\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ba611a6-9d53-4e5a-ab68-7fc8cd615779",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot confusion matrix\n",
    "\n",
    "cm = ConfusionMatrixDisplay.from_predictions(y_test,y_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d57486a5-e1e2-43c3-8090-b880b76bad74",
   "metadata": {},
   "source": [
    "# MLFlow Tracking code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25849a92-97bd-4f7e-a40b-4b593697080f",
   "metadata": {},
   "outputs": [],
   "source": [
    "model_pipe.get_params()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cee3f45-97ee-4888-bff3-f0f59031d906",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test.join(y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0e312f1-a1c8-491d-86d3-917296af16a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# set the uri for server\n",
    "\n",
    "mlflow.set_tracking_uri(\"http://127.0.0.1:8080\")\n",
    "\n",
    "mlflow.set_experiment(\"Mentos Zindagi\")\n",
    "\n",
    "with mlflow.start_run() as run:\n",
    "    # infer the model signature from the training inputs and predictions\n",
    "    data_signature = mlflow.models.infer_signature(model_input=X_train,model_output=model_pipe.predict(X_train))\n",
    "\n",
    "    # log every parameter of the full pipeline (preprocessor and classifier)\n",
    "    mlflow.log_params(model_pipe.get_params())\n",
    "\n",
    "    # log model metrics\n",
    "    mlflow.log_metrics(metrics)\n",
    "    \n",
    "    # log the model\n",
    "    mlflow.sklearn.log_model(sk_model=model_pipe,artifact_path=\"model.pkl\",signature=data_signature)\n",
    "\n",
    "    # Get the model uri\n",
    "    model_uri = mlflow.get_artifact_uri(\"model.pkl\")\n",
    "    \n",
    "    # # evaluate the model\n",
    "    # evaluations = mlflow.models.evaluate(model=model_uri,\n",
    "    #                        data=X_test.join(y_test),\n",
    "    #                        targets='survived',\n",
    "    #                        model_type=\"classifier\")\n",
    "\n",
    "    # log the confusion matrix\n",
    "    mlflow.log_figure(cm.figure_,artifact_file='confusion_matrix.png')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6db5e7a5-486f-4fb1-9070-77db2af3e98a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}