{ "cells": [ { "cell_type": "markdown", "id": "7dd3aed1-8c77-491a-beb4-6658b3e603b6", "metadata": {}, "source": [ "# Import Packages" ] }, { "cell_type": "code", "execution_count": null, "id": "b1b9541c-7de1-4c89-9424-01058657d4b8", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn import set_config\n", "from sklearn.ensemble import RandomForestClassifier\n", "\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline, FeatureUnion\n", "\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import (\n", " StandardScaler,\n", " MinMaxScaler,\n", " OneHotEncoder,\n", " OrdinalEncoder\n", ")\n", "\n", "from feature_engine.encoding import CountFrequencyEncoder\n", "from feature_engine.outliers.winsorizer import Winsorizer\n", "\n", "import mlflow\n", "\n", "from sklearn.metrics import (\n", " accuracy_score, \n", " precision_score, \n", " recall_score, \n", " f1_score\n", ")\n", "\n", "from sklearn.metrics import ConfusionMatrixDisplay" ] }, { "cell_type": "markdown", "id": "0f44afcc-35a3-4e78-8b0f-1bff5cac2f42", "metadata": {}, "source": [ "# Load the Data" ] }, { "cell_type": "code", "execution_count": null, "id": "fc883d66-7142-451c-b7a7-a88407311855", "metadata": {}, "outputs": [], "source": [ "# read the csv file\n", "\n", "df = pd.read_csv(\"data/titanic.csv\")\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "74d95fa4-20c7-4e1a-a34a-438343bf1b89", "metadata": {}, "outputs": [], "source": [ "# check for missing values in data\n", "\n", "(\n", " df\n", " .isna()\n", " .sum()\n", ")" ] }, { "cell_type": "markdown", "id": "b4406de8-2796-471b-9b1d-37f324eb25fa", "metadata": {}, "source": [ "**Observations**:\n", "1. `Age`, `Embarked` and `Cabin` columns have missing values." 
] }, { "cell_type": "code", "execution_count": null, "id": "c73034ac-df11-42dd-8238-c7ff9de91979", "metadata": {}, "outputs": [], "source": [ "# info about the data\n", "\n", "df.info()" ] }, { "cell_type": "markdown", "id": "34bdfe67-8229-491e-b08f-2388aea5aab6", "metadata": {}, "source": [ "# Data Cleaning" ] }, { "cell_type": "code", "execution_count": null, "id": "2f67329d-b6f3-4486-8ca0-bebfac68d258", "metadata": {}, "outputs": [], "source": [ "# columns removed by clean_data (names are matched after lower-casing)\n", "\n", "columns_to_drop = ['passengerid', 'name', 'ticket', 'cabin']" ] }, { "cell_type": "code", "execution_count": null, "id": "eae542f3-ee1c-4e5f-8600-85a29a7ec48a", "metadata": {}, "outputs": [], "source": [ "def clean_data(df):\n", "    \"\"\"Return a cleaned copy of the raw Titanic frame.\n", "\n", "    Column names are lower-cased, the columns listed in the notebook-level\n", "    ``columns_to_drop`` are removed, and ``sibsp`` and ``parch`` are replaced\n", "    by a single ``family`` column holding their sum.\n", "    \"\"\"\n", "    lowered = df.rename(columns=str.lower)\n", "    trimmed = lowered.drop(columns=columns_to_drop)\n", "    with_family = trimmed.assign(family=trimmed['sibsp'] + trimmed['parch'])\n", "    return with_family.drop(columns=['sibsp', 'parch'])" ] }, { "cell_type": "code", "execution_count": null, "id": "4465d425-1dd4-49be-9b1b-d7876fb42277", "metadata": {}, "outputs": [], "source": [ "# apply the cleaning steps to the raw frame\n", "\n", "final_df = clean_data(df)\n", "\n", "final_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "37cef40c-628a-42a9-934a-ae3461d46853", "metadata": {}, "outputs": [], "source": [ "# shape of the cleaned data\n", "\n", "print(f'The cleaned data has {final_df.shape[0]} rows and {final_df.shape[1]} columns')" ] }, { "cell_type": "code", "execution_count": null, "id": "cebfd73f-5ede-4a17-be63-7355369997f7", "metadata": {}, "outputs": [], "source": [ "# missing values per column in the cleaned data\n", "\n", "final_df.isna().sum()" ] }, { "cell_type": "markdown", "id": "087aedb7-b716-4d10-8e03-d9a9149e3c57", "metadata": {}, "source": [ "# EDA" ] }, { "cell_type": "code", "execution_count": null, "id": "075fc561-597a-48c8-9da4-718e1f0f21e0", "metadata": {}, "outputs": [], "source": [ "# distribution of target\n", "\n", "(\n", " final_df\n", " .loc[:,'survived']\n", " 
.value_counts(normalize=True)\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "c414edaf-7749-4f0d-bc77-288f1846379e", "metadata": {}, "outputs": [], "source": [ "# helper: boxplot of a single column\n", "\n", "def create_boxplot(data, column_name, hue=None):\n", "    \"\"\"Draw a seaborn boxplot with ``column_name`` on the y-axis.\n", "\n", "    ``hue`` is passed straight through to ``sns.boxplot``.\n", "    \"\"\"\n", "    sns.boxplot(y=column_name, hue=hue, data=data)" ] }, { "cell_type": "code", "execution_count": null, "id": "053c8ad1-307a-4182-b798-aecd2e56e349", "metadata": {}, "outputs": [], "source": [ "# boxplot for age column\n", "\n", "create_boxplot(data=final_df, column_name='age')" ] }, { "cell_type": "code", "execution_count": null, "id": "d4e6b0c1-beb6-4eb4-a1a3-e1ed297b7ac7", "metadata": {}, "outputs": [], "source": [ "# boxplot for fare column\n", "\n", "create_boxplot(data=final_df, column_name='fare')" ] }, { "cell_type": "markdown", "id": "2fc3dc52-6c52-4cef-b40d-f8b3f2553882", "metadata": {}, "source": [ "**Overview**\n", "- Outliers in the age and fare columns" ] }, { "cell_type": "code", "execution_count": null, "id": "9eb075d8-c329-45ec-b311-c3ef16c55357", "metadata": {}, "outputs": [], "source": [ "# helper: count plot for a categorical column\n", "\n", "def plot_distribution(data, column_name):\n", "    \"\"\"Draw a seaborn count plot with ``column_name`` on the x-axis.\"\"\"\n", "    sns.countplot(x=column_name, data=data)" ] }, { "cell_type": "code", "execution_count": null, "id": "a8b1d684-37d7-445a-91cf-d017e5f1efa2", "metadata": {}, "outputs": [], "source": [ "# distribution for pclass\n", "\n", "plot_distribution(data=final_df, column_name='pclass')" ] }, { "cell_type": "code", "execution_count": null, "id": "3ea410f0-8c0b-4281-acd8-9aecde4ee2d7", "metadata": {}, "outputs": [], "source": [ "# distribution for sex\n", "\n", "plot_distribution(data=final_df, column_name='sex')" ] }, { "cell_type": "code", "execution_count": null, "id": "d758c8c4-5541-4dac-9696-b0e99dab3979", "metadata": {}, "outputs": [], "source": [ "# distribution for embarked\n", "\n", "plot_distribution(data=final_df, column_name='embarked')" ] }, { "cell_type": "markdown", "id": "d7fff975-6e32-43bb-8ec6-6be0a39f5c1e", "metadata": {}, "source": [ "# Feature_Eng" ] }, { "cell_type": "code", "execution_count": null, 
"id": "110ea78a-d709-46bc-b6e7-dd813557bec8", "metadata": {}, "outputs": [], "source": [ "final_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "5c374064-e47c-40f0-baf7-54e0ff842560", "metadata": {}, "outputs": [], "source": [ "# separate the features (X) from the target (y)\n", "\n", "X = final_df.drop(columns='survived')\n", "y = final_df.loc[:, 'survived']" ] }, { "cell_type": "code", "execution_count": null, "id": "51861761-7ee7-4613-9992-2ddfaef05b53", "metadata": {}, "outputs": [], "source": [ "X.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "503e0bb6-af40-43d8-8614-8c56b5910ae3", "metadata": {}, "outputs": [], "source": [ "# hold out 20% of the rows for testing (fixed seed for reproducibility)\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X, y, test_size=0.2, random_state=42\n", ")\n", "\n", "print('The shape of training data is', X_train.shape)\n", "print('The shape of testing data is', X_test.shape)" ] }, { "cell_type": "markdown", "id": "970b2558-9fe4-4bf7-9d36-80775f1a640d", "metadata": {}, "source": [ "## Pipelines for Individual Columns" ] }, { "cell_type": "code", "execution_count": null, "id": "ce21c311-c9b5-48fb-9619-1c386b95b065", "metadata": {}, "outputs": [], "source": [ "# age: median imputation -> gaussian winsorizing (fold=3) -> standard scaling\n", "\n", "age_pipe = Pipeline(steps=[\n", "    ('impute', SimpleImputer(strategy='median')),\n", "    ('outliers', Winsorizer(capping_method='gaussian', fold=3)),\n", "    ('scale', StandardScaler()),\n", "])\n", "\n", "age_pipe" ] }, { "cell_type": "code", "execution_count": null, "id": "e9bc1761-c7d8-43ab-939e-ca1a84249af5", "metadata": {}, "outputs": [], "source": [ "# fare: IQR winsorizing (fold=1.5) -> standard scaling\n", "\n", "fare_pipe = Pipeline(steps=[\n", "    ('outliers', Winsorizer(capping_method='iqr', fold=1.5)),\n", "    ('scale', StandardScaler()),\n", "])\n", "\n", "fare_pipe" ] }, { "cell_type": "code", "execution_count": null, "id": "d588548f-ae54-43d3-8efe-16f34dd66954", "metadata": {}, "outputs": [], "source": [ "# embarked_pipeline\n", "\n", "embarked_pipe = Pipeline(steps=[\n", " ('impute',SimpleImputer(strategy='most_frequent')),\n", " 
('count_encode',CountFrequencyEncoder(encoding_method='count')),\n", " ('scale',MinMaxScaler())\n", "])\n", "\n", "embarked_pipe" ] }, { "cell_type": "markdown", "id": "24838a6d-af02-44dc-abfc-addd714f7533", "metadata": {}, "source": [ "## Column Transformer" ] }, { "cell_type": "code", "execution_count": null, "id": "1af74974-3b86-49ea-b495-663d20edd0a0", "metadata": {}, "outputs": [], "source": [ "# have scikit-learn transformers return pandas DataFrames instead of arrays\n", "\n", "set_config(transform_output='pandas')" ] }, { "cell_type": "code", "execution_count": null, "id": "95f9b639-2194-4cdc-b565-9021eb933aaf", "metadata": {}, "outputs": [], "source": [ "# build the column transformer\n", "\n", "preprocessor = ColumnTransformer(\n", "    transformers=[\n", "        ('age', age_pipe, ['age']),\n", "        ('fare', fare_pipe, ['fare']),\n", "        ('embarked', embarked_pipe, ['embarked']),\n", "        ('sex', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['sex']),\n", "        ('family', MinMaxScaler(), ['family']),\n", "    ],\n", "    remainder='passthrough',  # columns not listed above are passed through unchanged\n", "    n_jobs=-1,\n", "    force_int_remainder_cols=False,\n", ")\n", "\n", "preprocessor" ] }, { "cell_type": "code", "execution_count": null, "id": "aa6aa741-afc3-449c-b75d-38a1bea32de6", "metadata": {}, "outputs": [], "source": [ "# fit the preprocessor on the training data and show the transformed frame\n", "\n", "preprocessor.fit_transform(X_train)" ] }, { "cell_type": "code", "execution_count": null, "id": "9ad34e5a-43e4-4e81-b2bb-b92e2c0b90ca", "metadata": {}, "outputs": [], "source": [ "preprocessor.get_params()" ] }, { "cell_type": "markdown", "id": "898afc54-e717-4b3e-9142-c6235abdfe0a", "metadata": {}, "source": [ "# Model Pipeline" ] }, { "cell_type": "code", "execution_count": null, "id": "a5c5d60d-3746-46c1-b15b-0bc59f62a187", "metadata": {}, "outputs": [], "source": [ "# build the model pipeline\n", "\n", "model_params = {'bootstrap': True,\n", " 'ccp_alpha': 0.0,\n", " 'class_weight': None,\n", " 'criterion': 'gini',\n", " 'max_depth': 6,\n", " 'max_features': 'sqrt',\n", " 'max_leaf_nodes': None,\n", " 'max_samples': 0.8,\n", " 'min_impurity_decrease': 0.0,\n", " 
'min_samples_leaf': 1,\n", " 'min_samples_split': 2,\n", " 'min_weight_fraction_leaf': 0.0,\n", " 'monotonic_cst': None,\n", " 'n_estimators': 300,\n", " 'n_jobs': -1,\n", " 'oob_score': False,\n", " 'random_state': 30,\n", " 'verbose': 0,\n", " 'warm_start': False}" ] }, { "cell_type": "code", "execution_count": null, "id": "b19559c5-53cb-4630-b64d-cbf2a1c9ca39", "metadata": {}, "outputs": [], "source": [ "# full pipeline: column preprocessing followed by the random forest\n", "\n", "model_pipe = Pipeline(steps=[\n", "    ('preprocessor', preprocessor),\n", "    ('clf', RandomForestClassifier(**model_params)),\n", "])\n", "\n", "model_pipe" ] }, { "cell_type": "code", "execution_count": null, "id": "66876201-5959-45ca-9112-ef7d16bf66b5", "metadata": {}, "outputs": [], "source": [ "# fit the model on the training data\n", "\n", "model_pipe.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "id": "eaf4ffb7-1763-4000-b9bc-3d2a8b776704", "metadata": {}, "outputs": [], "source": [ "# evaluate the model on the test data\n", "\n", "y_pred = model_pipe.predict(X_test)\n", "\n", "# float() accepts both NumPy scalars and built-in floats, so all four metrics\n", "# are plain Python floats whatever type the scorer returns (.item() only\n", "# exists on NumPy scalars and would fail on a built-in float)\n", "accuracy = float(accuracy_score(y_test, y_pred))\n", "precision = float(precision_score(y_test, y_pred))\n", "recall = float(recall_score(y_test, y_pred))\n", "f1 = float(f1_score(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": null, "id": "3b4d315f-690e-442e-b2f0-f1872e6ef579", "metadata": {}, "outputs": [], "source": [ "# collect the test-set metrics in one dict (logged to MLflow further down)\n", "\n", "metrics = {\n", "    'accuracy': accuracy,\n", "    'precision': precision,\n", "    'recall': recall,\n", "    'f1_score': f1,\n", "}\n", "\n", "metrics" ] }, { "cell_type": "code", "execution_count": null, "id": "0ba611a6-9d53-4e5a-ab68-7fc8cd615779", "metadata": {}, "outputs": [], "source": [ "# plot confusion matrix (the display object is kept so its figure can be logged)\n", "\n", "cm = ConfusionMatrixDisplay.from_predictions(y_test, y_pred)" ] }, { "cell_type": "markdown", "id": "d57486a5-e1e2-43c3-8090-b880b76bad74", "metadata": {}, "source": [ "# MLFlow Tracking code" ] }, { "cell_type": "code", "execution_count": null, "id": "25849a92-97bd-4f7e-a40b-4b593697080f", 
"metadata": {}, "outputs": [], "source": [ "model_pipe.get_params()" ] }, { "cell_type": "code", "execution_count": null, "id": "5cee3f45-97ee-4888-bff3-f0f59031d906", "metadata": {}, "outputs": [], "source": [ "X_test.join(y_test)" ] }, { "cell_type": "code", "execution_count": null, "id": "f0e312f1-a1c8-491d-86d3-917296af16a8", "metadata": {}, "outputs": [], "source": [ "# set the uri for server\n", "\n", "mlflow.set_tracking_uri(\"http://127.0.0.1:8080\")\n", "\n", "mlflow.set_experiment(\"Mentos Zindagi\")\n", "\n", "with mlflow.start_run() as run:\n", "    # infer the model signature from the training inputs and their predictions\n", "    data_signature = mlflow.models.infer_signature(\n", "        model_input=X_train,\n", "        model_output=model_pipe.predict(X_train),\n", "    )\n", "\n", "    # log every pipeline parameter (preprocessing steps and classifier)\n", "    mlflow.log_params(model_pipe.get_params())\n", "\n", "    # log model metrics\n", "    mlflow.log_metrics(metrics)\n", "\n", "    # log the fitted pipeline together with its signature\n", "    mlflow.sklearn.log_model(\n", "        sk_model=model_pipe,\n", "        artifact_path=\"model.pkl\",\n", "        signature=data_signature,\n", "    )\n", "\n", "    # Get the model uri (only referenced by the disabled evaluation below)\n", "    model_uri = mlflow.get_artifact_uri(\"model.pkl\")\n", "\n", "    # # evaluate the model\n", "    # evaluations = mlflow.models.evaluate(model=model_uri,\n", "    #                                      data=X_test.join(y_test),\n", "    #                                      targets='survived',\n", "    #                                      model_type=\"classifier\")\n", "\n", "    # log the confusion matrix\n", "    mlflow.log_figure(cm.figure_, artifact_file='confusion_matrix.png')" ] }, { "cell_type": "code", "execution_count": null, "id": "6db5e7a5-486f-4fb1-9070-77db2af3e98a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }