Spaces:

Prathamesh1420
/

Mlflow_with_docker

Sleeping

App Files Files Community

Prathamesh1420 commited on 28 days ago

Commit

e9063f9

verified ·

1 Parent(s): 087fbf6

Upload MLFlow Mentos Zindagi.ipynb

Browse files

Files changed (1) hide show

MLFlow Mentos Zindagi.ipynb +696 -0

MLFlow Mentos Zindagi.ipynb ADDED Viewed

	@@ -0,0 +1,696 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7dd3aed1-8c77-491a-beb4-6658b3e603b6",
+   "metadata": {},
+   "source": [
+    "# Import Packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1b9541c-7de1-4c89-9424-01058657d4b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn import set_config\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
+    "\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.preprocessing import (\n",
+    "    StandardScaler,\n",
+    "    MinMaxScaler,\n",
+    "    OneHotEncoder,\n",
+    "    OrdinalEncoder\n",
+    ")\n",
+    "\n",
+    "from feature_engine.encoding import CountFrequencyEncoder\n",
+    "from feature_engine.outliers.winsorizer import Winsorizer\n",
+    "\n",
+    "import mlflow\n",
+    "\n",
+    "from sklearn.metrics import (\n",
+    "    accuracy_score, \n",
+    "    precision_score, \n",
+    "    recall_score, \n",
+    "    f1_score\n",
+    ")\n",
+    "\n",
+    "from sklearn.metrics import ConfusionMatrixDisplay"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f44afcc-35a3-4e78-8b0f-1bff5cac2f42",
+   "metadata": {},
+   "source": [
+    "# Load the Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc883d66-7142-451c-b7a7-a88407311855",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# read the csv file\n",
+    "\n",
+    "df = pd.read_csv(\"data/titanic.csv\")\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74d95fa4-20c7-4e1a-a34a-438343bf1b89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# check for missing values in data\n",
+    "\n",
+    "(\n",
+    "    df\n",
+    "    .isna()\n",
+    "    .sum()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b4406de8-2796-471b-9b1d-37f324eb25fa",
+   "metadata": {},
+   "source": [
+    "**Observations**:\n",
+    "1. `Age`, `Emabrked` and `Cabin` columns have missing values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c73034ac-df11-42dd-8238-c7ff9de91979",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# info about the data\n",
+    "\n",
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "34bdfe67-8229-491e-b08f-2388aea5aab6",
+   "metadata": {},
+   "source": [
+    "# Data CLeaning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f67329d-b6f3-4486-8ca0-bebfac68d258",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# columns to drop\n",
+    "\n",
+    "columns_to_drop = ['passengerid','name','ticket','cabin']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eae542f3-ee1c-4e5f-8600-85a29a7ec48a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_data(df):\n",
+    "    return (\n",
+    "        df\n",
+    "        .rename(columns=str.lower)\n",
+    "        .drop(columns=columns_to_drop)\n",
+    "        .assign(\n",
+    "            family = lambda df_ : df_['sibsp'] + df_['parch']\n",
+    "        )\n",
+    "        .drop(columns=['sibsp','parch'])\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4465d425-1dd4-49be-9b1b-d7876fb42277",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_df = clean_data(df)\n",
+    "\n",
+    "final_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37cef40c-628a-42a9-934a-ae3461d46853",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# shape of the cleaned data \n",
+    "\n",
+    "print(f'The cleaned data has {final_df.shape[0]} rows and {final_df.shape[1]} columns')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cebfd73f-5ede-4a17-be63-7355369997f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# missing values in the cleaned data\n",
+    "\n",
+    "(\n",
+    "    final_df\n",
+    "    .isna()\n",
+    "    .sum()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "087aedb7-b716-4d10-8e03-d9a9149e3c57",
+   "metadata": {},
+   "source": [
+    "# EDA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "075fc561-597a-48c8-9da4-718e1f0f21e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# distribution of target\n",
+    "\n",
+    "(\n",
+    "    final_df\n",
+    "    .loc[:,'survived']\n",
+    "    .value_counts(normalize=True)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c414edaf-7749-4f0d-bc77-288f1846379e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# boxplots\n",
+    "\n",
+    "def create_boxplot(data,column_name,hue=None):\n",
+    "    sns.boxplot(data=data, y=column_name, hue=hue)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "053c8ad1-307a-4182-b798-aecd2e56e349",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# boxplot for age column\n",
+    "create_boxplot(final_df,'age')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4e6b0c1-beb6-4eb4-a1a3-e1ed297b7ac7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# boxplot for fare column\n",
+    "\n",
+    "create_boxplot(final_df,'fare')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2fc3dc52-6c52-4cef-b40d-f8b3f2553882",
+   "metadata": {},
+   "source": [
+    "**Overview**\n",
+    "- Outliers in the age and fare columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9eb075d8-c329-45ec-b311-c3ef16c55357",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot the distribution of categorical columns\n",
+    "\n",
+    "def plot_distribution(data,column_name):\n",
+    "    sns.countplot(data=data, x=column_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8b1d684-37d7-445a-91cf-d017e5f1efa2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# distribution for pclass\n",
+    "plot_distribution(final_df,'pclass')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ea410f0-8c0b-4281-acd8-9aecde4ee2d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# distribution for sex\n",
+    "\n",
+    "plot_distribution(final_df,'sex')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d758c8c4-5541-4dac-9696-b0e99dab3979",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# distribution for embarked \n",
+    "\n",
+    "plot_distribution(final_df,'embarked')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d7fff975-6e32-43bb-8ec6-6be0a39f5c1e",
+   "metadata": {},
+   "source": [
+    "# Feature_Eng"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "110ea78a-d709-46bc-b6e7-dd813557bec8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c374064-e47c-40f0-baf7-54e0ff842560",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make X and y\n",
+    "\n",
+    "X = final_df.drop(columns=['survived'])\n",
+    "y = final_df['survived']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51861761-7ee7-4613-9992-2ddfaef05b53",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "503e0bb6-af40-43d8-8614-8c56b5910ae3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# do train test split\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)\n",
+    "\n",
+    "print('The shape of training data is',X_train.shape)\n",
+    "print('The shape of testing data is',X_test.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "970b2558-9fe4-4bf7-9d36-80775f1a640d",
+   "metadata": {},
+   "source": [
+    "## Pipelines for Individual Columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce21c311-c9b5-48fb-9619-1c386b95b065",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# age_pipeline\n",
+    "age_pipe = Pipeline(steps=[\n",
+    "    ('impute',SimpleImputer(strategy='median')),\n",
+    "    ('outliers',Winsorizer(capping_method='gaussian',fold=3)),\n",
+    "    ('scale',StandardScaler())\n",
+    "])\n",
+    "\n",
+    "\n",
+    "age_pipe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9bc1761-c7d8-43ab-939e-ca1a84249af5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fare pipeline\n",
+    "\n",
+    "fare_pipe = Pipeline(steps=[\n",
+    "    ('outliers',Winsorizer(capping_method='iqr',fold=1.5)),\n",
+    "    ('scale',StandardScaler())\n",
+    "])\n",
+    "\n",
+    "fare_pipe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d588548f-ae54-43d3-8efe-16f34dd66954",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# embarked_pipeline\n",
+    "\n",
+    "embarked_pipe = Pipeline(steps=[\n",
+    "    ('impute',SimpleImputer(strategy='most_frequent')),\n",
+    "    ('count_encode',CountFrequencyEncoder(encoding_method='count')),\n",
+    "    ('scale',MinMaxScaler())\n",
+    "])\n",
+    "\n",
+    "embarked_pipe"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "24838a6d-af02-44dc-abfc-addd714f7533",
+   "metadata": {},
+   "source": [
+    "## Column Transformer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1af74974-3b86-49ea-b495-663d20edd0a0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "set_config(transform_output='pandas')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95f9b639-2194-4cdc-b565-9021eb933aaf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make column column transformer\n",
+    "\n",
+    "preprocessor = ColumnTransformer(transformers=[\n",
+    "    ('age',age_pipe,['age']),\n",
+    "    ('fare',fare_pipe,['fare']),\n",
+    "    ('embarked',embarked_pipe,['embarked']),\n",
+    "    ('sex',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),['sex']),\n",
+    "    ('family',MinMaxScaler(),['family'])\n",
+    "],remainder='passthrough',n_jobs=-1,force_int_remainder_cols=False)\n",
+    "\n",
+    "preprocessor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa6aa741-afc3-449c-b75d-38a1bea32de6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fit and transform the training data\n",
+    "\n",
+    "preprocessor.fit_transform(X_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ad34e5a-43e4-4e81-b2bb-b92e2c0b90ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preprocessor.get_params()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "898afc54-e717-4b3e-9142-c6235abdfe0a",
+   "metadata": {},
+   "source": [
+    "# Model Pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a5c5d60d-3746-46c1-b15b-0bc59f62a187",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# build the model pipeline\n",
+    "\n",
+    "model_params = {'bootstrap': True,\n",
+    " 'ccp_alpha': 0.0,\n",
+    " 'class_weight': None,\n",
+    " 'criterion': 'gini',\n",
+    " 'max_depth': 6,\n",
+    " 'max_features': 'sqrt',\n",
+    " 'max_leaf_nodes': None,\n",
+    " 'max_samples': 0.8,\n",
+    " 'min_impurity_decrease': 0.0,\n",
+    " 'min_samples_leaf': 1,\n",
+    " 'min_samples_split': 2,\n",
+    " 'min_weight_fraction_leaf': 0.0,\n",
+    " 'monotonic_cst': None,\n",
+    " 'n_estimators': 300,\n",
+    " 'n_jobs': -1,\n",
+    " 'oob_score': False,\n",
+    " 'random_state': 30,\n",
+    " 'verbose': 0,\n",
+    " 'warm_start': False}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b19559c5-53cb-4630-b64d-cbf2a1c9ca39",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_pipe = Pipeline(steps=[\n",
+    "    ('preprocessor',preprocessor),\n",
+    "    ('clf',RandomForestClassifier(**model_params))\n",
+    "])\n",
+    "\n",
+    "model_pipe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "66876201-5959-45ca-9112-ef7d16bf66b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fit the model on the training data\n",
+    "\n",
+    "model_pipe.fit(X_train,y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eaf4ffb7-1763-4000-b9bc-3d2a8b776704",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# evaluate the model on the test data\n",
+    "\n",
+    "y_pred = model_pipe.predict(X_test)\n",
+    "\n",
+    "accuracy = accuracy_score(y_test,y_pred)\n",
+    "precision = precision_score(y_test,y_pred).item()\n",
+    "recall = recall_score(y_test,y_pred).item()\n",
+    "f1 = f1_score(y_test,y_pred).item()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b4d315f-690e-442e-b2f0-f1872e6ef579",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# metrics dict\n",
+    "\n",
+    "metrics = {\n",
+    "    'accuracy': accuracy,\n",
+    "    'precision': precision,\n",
+    "    'recall': recall,\n",
+    "    'f1_score': f1\n",
+    "}\n",
+    "\n",
+    "metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ba611a6-9d53-4e5a-ab68-7fc8cd615779",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot confusion matrix\n",
+    "\n",
+    "cm = ConfusionMatrixDisplay.from_predictions(y_test,y_pred)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d57486a5-e1e2-43c3-8090-b880b76bad74",
+   "metadata": {},
+   "source": [
+    "# MLFlow Tracking code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25849a92-97bd-4f7e-a40b-4b593697080f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_pipe.get_params()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5cee3f45-97ee-4888-bff3-f0f59031d906",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_test.join(y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0e312f1-a1c8-491d-86d3-917296af16a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set the uri for server\n",
+    "\n",
+    "mlflow.set_tracking_uri(\"http://127.0.0.1:8080\")\n",
+    "\n",
+    "mlflow.set_experiment(\"Mentos Zindagi\")\n",
+    "\n",
+    "with mlflow.start_run() as run:\n",
+    "    # log the data signature\n",
+    "    data_signature = mlflow.models.infer_signature(model_input=X_train,model_output=model_pipe.predict(X_train))\n",
+    "\n",
+    "    # log preprocessor parameters\n",
+    "    mlflow.log_params(model_pipe.get_params())\n",
+    "\n",
+    "    # log model metrics\n",
+    "    mlflow.log_metrics(metrics)\n",
+    "    \n",
+    "    # log the model\n",
+    "    mlflow.sklearn.log_model(sk_model=model_pipe,artifact_path=\"model.pkl\",signature=data_signature)\n",
+    "\n",
+    "    # Get the model uri\n",
+    "    model_uri = mlflow.get_artifact_uri(\"model.pkl\")\n",
+    "    \n",
+    "    # # evaluate the model\n",
+    "    # evaluations = mlflow.models.evaluate(model=model_uri,\n",
+    "    #                        data=X_test.join(y_test),\n",
+    "    #                        targets='survived',\n",
+    "    #                        model_type=\"classifier\")\n",
+    "\n",
+    "    # log the confusion matrix\n",
+    "    mlflow.log_figure(cm.figure_,artifact_file='confusion_matrix.png')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6db5e7a5-486f-4fb1-9070-77db2af3e98a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}