{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Delete the lines with a brown background color in the excel files\n",
    "The excel files are located in the Data/Classification/labeled_data folder of the MESCnn repository."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from openpyxl import Workbook, load_workbook\n",
    "import os \n",
    "\n",
    "path_to_excel = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n",
    "\n",
    "# Function to get the RGB value of a color\n",
    "def get_rgb(color):\n",
    "    return tuple(int(color[i:i+2], 16) for i in (0, 2, 4))\n",
    "\n",
    "for file in os.listdir(path_to_excel):\n",
    "    if file.endswith(\".xlsx\") or file.endswith(\".XLSX\"):\n",
    "        file = os.path.join(path_to_excel, file)\n",
    "        # Load the workbook\n",
    "        workbook = load_workbook(file)\n",
    "        \n",
    "        # Select the first sheet\n",
    "        sheet = workbook.active\n",
    " \n",
    "        # Create a new workbook\n",
    "        new_workbook = Workbook()\n",
    "        new_sheet = new_workbook.active\n",
    "        \n",
    "        # List to store rows with RGB colors\n",
    "        rows_with_rgb = []\n",
    "        \n",
    "        # Iterate through each row\n",
    "        for row_idx, row in enumerate(sheet.iter_rows(), start=1):\n",
    "            row_colors = []\n",
    "            has_rgb_color = False  # Flag to check if row has any RGB color\n",
    "            # Iterate through each cell in the row\n",
    "            for cell in row:\n",
    "                fill = cell.fill\n",
    "                if fill.start_color.type == 'rgb':\n",
    "                    rgb_value = get_rgb(fill.start_color.rgb)\n",
    "                    row_colors.append(rgb_value)\n",
    "                    has_rgb_color = True\n",
    "            # Check if the row has at least one RGB color\n",
    "            if has_rgb_color:\n",
    "                rows_with_rgb.append(row)\n",
    " \n",
    "        # Write rows with RGB colors to the new workbook\n",
    "        for row in rows_with_rgb:\n",
    "            new_sheet.append([cell.value for cell in row])\n",
    "        \n",
    "        # Save the new workbook\n",
    "        new_workbook.save(file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract labeled data from excel files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C1104066_JGI.XLSX\n",
      "C1105034_JGI.XLSX\n",
      "C1110748_JGI.xlsx\n",
      "C1112141_JGI.XLSX\n",
      "C1105798_JGI.xlsx\n",
      "C1117893_JGI.xlsx\n",
      "C1107892_JGI.xlsx\n",
      "C1107752_JGI.xlsx\n",
      "C1105642_JGI.XLSX\n",
      "                                          Patch names          M    E    S  \\\n",
      "0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg          0    0    1   \n",
      "1   glomerulus C1104066 [142336, 49680, 744, 640]....          0    0  GGS   \n",
      "2   glomerulus C1104066 [142772, 48280, 1100, 864]...          1    0    0   \n",
      "3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg          0    0  GGS   \n",
      "4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg          0    0    1   \n",
      "..                                                ...        ...  ...  ...   \n",
      "47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg  nan_label  noE  GGS   \n",
      "48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg          1    0    1   \n",
      "49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg  nan_label  noE  GGS   \n",
      "50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg  nan_label  noE  GGS   \n",
      "51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg  nan_label  noE  GGS   \n",
      "\n",
      "      C  \n",
      "0     0  \n",
      "1     0  \n",
      "2     0  \n",
      "3     0  \n",
      "4     0  \n",
      "..  ...  \n",
      "47  noC  \n",
      "48    0  \n",
      "49  noC  \n",
      "50  noC  \n",
      "51  noC  \n",
      "\n",
      "[470 rows x 5 columns]\n",
      "(470, 5)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "   \n",
    "# Set the path to the labeled data directory\n",
    "labeled_data_dir = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n",
    "\n",
    "# Get the list of excel files in the labeled data directory\n",
    "excel_files = [file for file in os.listdir(labeled_data_dir) if file.endswith(\".xlsx\") or file.endswith(\".XLSX\")]\n",
    "\n",
    "# Create an empty dataframe\n",
    "df_combined = pd.DataFrame(columns=[\"Patch names\", \"M\", \"E\", \"S\", \"C\"])\n",
    "\n",
    "# Iterate over the excel files\n",
    "for file in excel_files:\n",
    "    print(file)\n",
    "    # Read the excel file\n",
    "    df = pd.read_excel(os.path.join(labeled_data_dir, file))\n",
    "    \n",
    "    if file == \"C1107752_JGI.xlsx\": # This file raises an error for a reason I don't understand\n",
    "        corrected_index = 61  \n",
    "    else:\n",
    "        # Find the index of the row with \"CORRECTED\" or \"Corrected\" value in the first column\n",
    "        if (df.iloc[:, 0] == \"CORRECTED\").any():\n",
    "            corrected_index = df[df.iloc[:, 0] == \"CORRECTED\"].index[0]\n",
    "        elif (df.iloc[:, 0] == \"Corrected\").any():\n",
    "            corrected_index = df[df.iloc[:, 0] == \"Corrected\"].index[0]\n",
    "        elif (df.iloc[:, 0] == \"CORRECTED JGI\").any():\n",
    "            corrected_index = df[df.iloc[:, 0] == \"CORRECTED JGI\"].index[0]\n",
    "        else:\n",
    "            corrected_index = df[df.iloc[:, 0] == \"filename\"].index[0]        \n",
    "        \n",
    "    # Skip the rows before the \"CORRECTED\" row and select the following rows\n",
    "    df = df.iloc[corrected_index + 1:]\n",
    "    \n",
    "    # Get the values in the M, E, S, and C columns\n",
    "    m_values = df[\"M\"].values\n",
    "    e_values = df[\"E\"].values\n",
    "    s_values = df[\"S\"].values\n",
    "    c_values = df[\"C\"].values\n",
    "    \n",
    "    # Get the name of each patch in the Patch_name column\n",
    "    patch_names = df[\"filename\"].values\n",
    "    \n",
    "    # Split the patch names to keep only the part after the last '\\'\n",
    "    patch_names = [name.split('\\\\')[-1] for name in patch_names]\n",
    "    \n",
    "    # Create a dataframe for the current file\n",
    "    df_current = pd.DataFrame({\n",
    "        \"Patch names\": patch_names,\n",
    "        \"M\": m_values,\n",
    "        \"E\": e_values,\n",
    "        \"S\": s_values,\n",
    "        \"C\": c_values\n",
    "    })\n",
    "    \n",
    "    # Append the current dataframe to the combined dataframe\n",
    "    df_combined = pd.concat([df_combined, df_current])\n",
    "\n",
    "# Print the combined dataframe\n",
    "print(df_combined)\n",
    "print(df_combined.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                          Patch names          M    E     S  \\\n",
      "0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg        noM  noE   SGS   \n",
      "1   glomerulus C1104066 [142336, 49680, 744, 640]....        noM  noE   GGS   \n",
      "2   glomerulus C1104066 [142772, 48280, 1100, 864]...       yesM  noE  NoGS   \n",
      "3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg        noM  noE   GGS   \n",
      "4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg        noM  noE   SGS   \n",
      "..                                                ...        ...  ...   ...   \n",
      "47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg  nan_label  noE   GGS   \n",
      "48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg       yesM  noE   SGS   \n",
      "49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg  nan_label  noE   GGS   \n",
      "50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg  nan_label  noE   GGS   \n",
      "51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg  nan_label  noE   GGS   \n",
      "\n",
      "      C  \n",
      "0   noC  \n",
      "1   noC  \n",
      "2   noC  \n",
      "3   noC  \n",
      "4   noC  \n",
      "..  ...  \n",
      "47  noC  \n",
      "48  noC  \n",
      "49  noC  \n",
      "50  noC  \n",
      "51  noC  \n",
      "\n",
      "[470 rows x 5 columns]\n"
     ]
    }
   ],
   "source": [
    "mesc_def = {\n",
    "    \"M\": {\n",
    "        0: \"noM\",\n",
    "        1: \"yesM\",\n",
    "    },\n",
    "    \"E\": {\n",
    "        0: \"noE\",\n",
    "        1: \"yesE\"\n",
    "    },\n",
    "    \"S\": {\n",
    "        \"GGS\": \"GGS\",\n",
    "        0: \"NoGS\",\n",
    "        1: \"SGS\"\n",
    "    },\n",
    "    \"C\": {\n",
    "        0: \"noC\",\n",
    "        1: \"yesC\"\n",
    "    }\n",
    "}\n",
    "df_combined[\"M\"] = df_combined[\"M\"].replace(mesc_def[\"M\"])\n",
    "df_combined[\"E\"] = df_combined[\"E\"].replace(mesc_def[\"E\"])\n",
    "df_combined[\"S\"] = df_combined[\"S\"].replace(mesc_def[\"S\"])\n",
    "df_combined[\"C\"] = df_combined[\"C\"].replace(mesc_def[\"C\"])\n",
    "print(df_combined)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['yesE', 'noM', 'noE', 'NoGS', 10, 'yesC', 'noC', 'yesM', 'SGS', 'GGS', nan, 'nan_label']\n",
      "                                          Patch names     M    E     S    C\n",
      "0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg   noM  noE   SGS  noC\n",
      "1   glomerulus C1104066 [142336, 49680, 744, 640]....   NaN  NaN   GGS  NaN\n",
      "2   glomerulus C1104066 [142772, 48280, 1100, 864]...  yesM  noE  NoGS  noC\n",
      "3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg   NaN  NaN   GGS  NaN\n",
      "4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg   noM  noE   SGS  noC\n",
      "..                                                ...   ...  ...   ...  ...\n",
      "47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg   NaN  NaN   GGS  NaN\n",
      "48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg  yesM  noE   SGS  noC\n",
      "49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg   NaN  NaN   GGS  NaN\n",
      "50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg   NaN  NaN   GGS  NaN\n",
      "51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg   NaN  NaN   GGS  NaN\n",
      "\n",
      "[470 rows x 5 columns]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "labels = df_combined[['M', 'E', 'S', 'C']].values.flatten()\n",
    "distinct_labels = list(set(labels))\n",
    "print(distinct_labels)\n",
    "\n",
    "possible_labels = [\"noM\", \"yesM\", \"noE\", \"yesE\", \"GGS\", \"NoGS\", \"SGS\", \"noC\", \"yesC\", \"nan_label\"]\n",
    "\n",
    "# Replace values that are not in the possible_labels list with NaN\n",
    "df_combined.loc[:, 'M':'C'] = df_combined.loc[:, 'M':'C'].apply(lambda x: np.where(x.isin(possible_labels), x, np.nan))\n",
    "\n",
    "# If the value in the S column is \"GGS\", set the value in the other columns to NaN\n",
    "df_combined.loc[df_combined[\"S\"] == \"GGS\", [\"M\", \"E\", \"C\"]] = np.nan\n",
    "\n",
    "# Print the updated dataframe\n",
    "print(df_combined)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                          Patch names     M    E     S    C\n",
      "1   glomerulus C1104066 [142336, 49680, 744, 640]....   NaN  NaN   GGS  NaN\n",
      "3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg   NaN  NaN   GGS  NaN\n",
      "7    glomerulus C1104066 [8044, 62252, 752, 796].jpeg   NaN  NaN   GGS  NaN\n",
      "15  glomerulus C1104066 [94652, 48228, 636, 644].jpeg   NaN  NaN   GGS  NaN\n",
      "17  glomerulus C1105034 [150832, 29052, 600, 496]....   NaN  NaN   GGS  NaN\n",
      "9   glomerulus C1110748 [129452, 5728, 708, 512].jpeg   NaN  NaN   GGS  NaN\n",
      "19  glomerulus C1110748 [134904, 7652, 776, 692].jpeg   NaN  NaN   GGS  NaN\n",
      "22  glomerulus C1110748 [136192, 55140, 788, 688]....   NaN  NaN   GGS  NaN\n",
      "25  glomerulus C1110748 [145592, 41936, 740, 640]....   NaN  NaN   GGS  NaN\n",
      "40  glomerulus C1110748 [154628, 24972, 804, 684]....   NaN  NaN   GGS  NaN\n",
      "41  glomerulus C1110748 [155592, 25764, 648, 612]....   NaN  NaN   GGS  NaN\n",
      "46  glomerulus C1110748 [156748, 71428, 812, 692]....   NaN  NaN   GGS  NaN\n",
      "48  glomerulus C1110748 [157812, 72180, 600, 536]....   NaN  NaN   GGS  NaN\n",
      "36  glomerulus C1112141 [78580, 16560, 656, 788].jpeg   NaN  NaN   GGS  NaN\n",
      "43  glomerulus C1112141 [82724, 17252, 860, 808].jpeg   NaN  NaN   GGS  NaN\n",
      "46  glomerulus C1112141 [83852, 19840, 884, 944].jpeg  yesM  NaN  NoGS  noC\n",
      "48  glomerulus C1112141 [86140, 60432, 720, 776].jpeg   NaN  NaN   GGS  NaN\n",
      "50  glomerulus C1112141 [87964, 20760, 672, 732].jpeg   NaN  NaN   GGS  NaN\n",
      "55  glomerulus C1112141 [90196, 61504, 848, 804].jpeg   NaN  NaN   GGS  NaN\n",
      "58  glomerulus C1112141 [95092, 65612, 680, 668].jpeg   NaN  NaN   GGS  NaN\n",
      "4   glomerulus C1105798 [118952, 9668, 980, 896].jpeg   NaN  NaN   GGS  NaN\n",
      "6   glomerulus C1105798 [120488, 15428, 684, 516]....   NaN  NaN   GGS  NaN\n",
      "14  glomerulus C1105798 [129104, 54064, 708, 576]....   NaN  NaN   GGS  NaN\n",
      "54  glomerulus C1105798 [76196, 61668, 740, 968].jpeg   NaN  NaN   GGS  NaN\n",
      "28  glomerulus C1117893 [26068, 32092, 724, 708].jpeg   NaN  NaN   GGS  NaN\n",
      "32  glomerulus C1117893 [31252, 77564, 700, 696].jpeg   NaN  NaN   GGS  NaN\n",
      "33  glomerulus C1117893 [65224, 17120, 528, 544].jpeg   NaN  NaN   GGS  NaN\n",
      "11  glomerulus C1107892 [126480, 27244, 588, 564]....   NaN  NaN   GGS  NaN\n",
      "43  glomerulus C1107892 [75916, 26668, 564, 572].jpeg   NaN  NaN   GGS  NaN\n",
      "44  glomerulus C1107892 [76200, 75040, 508, 576].jpeg   NaN  NaN   GGS  NaN\n",
      "48  glomerulus C1107892 [77772, 25272, 740, 760].jpeg   NaN  NaN   GGS  NaN\n",
      "49  glomerulus C1107892 [77980, 73584, 732, 724].jpeg   NaN  NaN   GGS  NaN\n",
      "55  glomerulus C1107892 [80568, 69696, 616, 644].jpeg   NaN  NaN   GGS  NaN\n",
      "56  glomerulus C1107892 [80608, 21544, 624, 660].jpeg   NaN  NaN   GGS  NaN\n",
      "11  glomerulus C1105642 [136108, 72452, 612, 532]....   NaN  NaN   GGS  NaN\n",
      "12  glomerulus C1105642 [136892, 73056, 596, 540]....   NaN  NaN   GGS  NaN\n",
      "13  glomerulus C1105642 [137860, 71816, 640, 728]....   NaN  NaN   GGS  NaN\n",
      "18  glomerulus C1105642 [140788, 20956, 616, 548]....   NaN  NaN   GGS  NaN\n",
      "19  glomerulus C1105642 [141656, 21460, 620, 576]....   NaN  NaN   GGS  NaN\n",
      "20  glomerulus C1105642 [142460, 20320, 540, 512]....   NaN  NaN   GGS  NaN\n",
      "22  glomerulus C1105642 [14640, 21940, 524, 584].jpeg   NaN  NaN   GGS  NaN\n",
      "29  glomerulus C1105642 [64876, 12060, 596, 648].jpeg   NaN  NaN   GGS  NaN\n",
      "33  glomerulus C1105642 [67600, 62876, 656, 680].jpeg   NaN  NaN   GGS  NaN\n",
      "35  glomerulus C1105642 [68388, 15580, 644, 604].jpeg   NaN  NaN   GGS  NaN\n",
      "40  glomerulus C1105642 [70972, 66596, 652, 628].jpeg   NaN  NaN   GGS  NaN\n",
      "41  glomerulus C1105642 [71324, 17312, 560, 556].jpeg   NaN  NaN   GGS  NaN\n",
      "46  glomerulus C1105642 [72752, 20572, 620, 524].jpeg   NaN  NaN   GGS  NaN\n",
      "47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg   NaN  NaN   GGS  NaN\n",
      "49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg   NaN  NaN   GGS  NaN\n",
      "50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg   NaN  NaN   GGS  NaN\n",
      "51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg   NaN  NaN   GGS  NaN\n"
     ]
    }
   ],
   "source": [
    "nan_rows = df_combined[df_combined.isnull().any(axis=1)]\n",
    "print(nan_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Patch names</th>\n",
       "      <th>M</th>\n",
       "      <th>E</th>\n",
       "      <th>S</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>glomerulus C1107752 [130360, 32956, 1020, 1008...</td>\n",
       "      <td>yesM</td>\n",
       "      <td>yesE</td>\n",
       "      <td>NoGS</td>\n",
       "      <td>yesC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>glomerulus C1107752 [135308, 69504, 1012, 1004...</td>\n",
       "      <td>yesM</td>\n",
       "      <td>noE</td>\n",
       "      <td>NoGS</td>\n",
       "      <td>yesC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>glomerulus C1107752 [137584, 31764, 836, 872]....</td>\n",
       "      <td>yesM</td>\n",
       "      <td>noE</td>\n",
       "      <td>NoGS</td>\n",
       "      <td>yesC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>glomerulus C1107752 [87436, 35528, 724, 844].jpeg</td>\n",
       "      <td>yesM</td>\n",
       "      <td>noE</td>\n",
       "      <td>NoGS</td>\n",
       "      <td>yesC</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>glomerulus C1105642 [120200, 56808, 1304, 1140...</td>\n",
       "      <td>yesM</td>\n",
       "      <td>noE</td>\n",
       "      <td>SGS</td>\n",
       "      <td>yesC</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          Patch names     M     E     S     C\n",
       "1   glomerulus C1107752 [130360, 32956, 1020, 1008...  yesM  yesE  NoGS  yesC\n",
       "6   glomerulus C1107752 [135308, 69504, 1012, 1004...  yesM   noE  NoGS  yesC\n",
       "10  glomerulus C1107752 [137584, 31764, 836, 872]....  yesM   noE  NoGS  yesC\n",
       "39  glomerulus C1107752 [87436, 35528, 724, 844].jpeg  yesM   noE  NoGS  yesC\n",
       "2   glomerulus C1105642 [120200, 56808, 1304, 1140...  yesM   noE   SGS  yesC"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print the rows with yesC in the C column\n",
    "yesC_rows = df_combined[df_combined[\"C\"] == \"yesC\"]\n",
    "yesC_rows"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Separate the patches into train and val sets \n",
    "Test set needs to be added but we didn't have enough data so we decided to use the validation set as the test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Seed is -828\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WSI images have been split into train and val folders.\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "import shutil\n",
    "import sys\n",
    "\n",
    "# Set the path to the Crop-256 folder\n",
    "crop256_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Crops\"\n",
    "\n",
    "# Set the path to the Data/Classification folder\n",
    "dataset_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification\"\n",
    "\n",
    "# Set the train and val ratio\n",
    "train_ratio = 0.7\n",
    "val_ratio = 0.3\n",
    "\n",
    "# Create the train and val folders\n",
    "train_folder = os.path.join(dataset_folder, \"train\")\n",
    "val_folder = os.path.join(dataset_folder, \"val\")\n",
    "os.makedirs(train_folder, exist_ok=True)\n",
    "os.makedirs(val_folder, exist_ok=True)\n",
    "\n",
    "# If the train and val folders are not empty, ask the user to confirm if they want to overwrite the folders\n",
    "if len(os.listdir(train_folder)) > 0 or len(os.listdir(val_folder)) > 0:\n",
    "    response = input(\"The train and val folders are not empty. Do you want to overwrite the folders? (yes/no): \")\n",
    "    if response.lower() != \"yes\":\n",
    "        print(\"Exiting the script.\")\n",
    "        sys.exit()\n",
    "    if response.lower() == \"yes\":\n",
    "        # Remove the existing folders\n",
    "        shutil.rmtree(train_folder)\n",
    "        shutil.rmtree(val_folder)\n",
    "        # Create the folders again\n",
    "        os.makedirs(train_folder, exist_ok=True)\n",
    "        os.makedirs(val_folder, exist_ok=True)\n",
    "        \n",
    "# Get the list of WSI folders in the Crop-256 folder\n",
    "wsi_folders = [wsi for wsi in os.listdir(crop256_folder)]\n",
    "\n",
    "# Shuffle the list of WSI images\n",
    "seed = random.randint(-1000, 1000)\n",
    "print(f\"Seed is {seed}\")\n",
    "random.seed(seed) # Allows for reproducibility\n",
    "\n",
    "imgs = []\n",
    "os.makedirs(os.path.join(train_folder), exist_ok=True)\n",
    "for wsi in wsi_folders:\n",
    "    # Copy the images to the train folder\n",
    "    for image in os.listdir(os.path.join(crop256_folder, wsi)):\n",
    "        src_path = os.path.join(crop256_folder, wsi, image)\n",
    "        dst_path = os.path.join(dataset_folder, image)\n",
    "        imgs.append(image)\n",
    "        shutil.copy(src_path, dst_path)\n",
    "\n",
    "# Shuffle the list of image paths\n",
    "random.seed(seed) # Allows for reproducibility\n",
    "random.shuffle(imgs)\n",
    "\n",
    "# Split the image paths into train and val sets\n",
    "train_size = int(train_ratio * len(imgs))\n",
    "train_imgs = imgs[:train_size]\n",
    "val_imgs = imgs[train_size:]\n",
    "\n",
    "# Copy the train images to the train folder\n",
    "os.makedirs(os.path.join(train_folder), exist_ok=True)\n",
    "# Copy the images to the train folder\n",
    "for image in train_imgs:\n",
    "    src_path = os.path.join(dataset_folder, image)\n",
    "    dst_path = os.path.join(train_folder, image)\n",
    "    shutil.copy(src_path, dst_path)\n",
    "        \n",
    "# Create the folder in the val folder\n",
    "os.makedirs(os.path.join(val_folder), exist_ok=True)\n",
    "# Copy the images to the val folder\n",
    "for image in val_imgs:\n",
    "    src_path = os.path.join(dataset_folder, image)\n",
    "    dst_path = os.path.join(val_folder, image)\n",
    "    shutil.copy(src_path, dst_path)\n",
    "\n",
    "# Remove the images from the dataset folder\n",
    "for image in imgs:\n",
    "    os.remove(os.path.join(dataset_folder, image))\n",
    "\n",
    "print(\"WSI images have been split into train and val folders.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sort the patches into their respective classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the path to the train and val folders\n",
    "train_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/train\"\n",
    "val_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/val\"\n",
    "\n",
    "# Create new subdirectories for the labels in the train and val folders \n",
    "for label in possible_labels:\n",
    "    os.makedirs(os.path.join(train_folder, label), exist_ok=True)\n",
    "    os.makedirs(os.path.join(val_folder, label), exist_ok=True)\n",
    "    \n",
    "# Iterate over the rows in the df_combined dataframe\n",
    "for index, row in df_combined.iterrows():\n",
    "    # Get the labels of the current row\n",
    "    labels = row[[\"M\", \"E\", \"S\", \"C\"]]\n",
    "    \n",
    "    # Get the name of the current patch\n",
    "    patch_name = row[\"Patch names\"]\n",
    "    \n",
    "    # Set the source path of the image\n",
    "    if patch_name in os.listdir(train_folder):\n",
    "        source_path = os.path.join(train_folder, patch_name)\n",
    "    elif patch_name in os.listdir(val_folder):\n",
    "        source_path = os.path.join(val_folder, patch_name)\n",
    "    \n",
    "    # Set the destination paths of the image\n",
    "    for label in labels:\n",
    "        if label in possible_labels:\n",
    "            if source_path.split(\"/\")[-2] == \"train\":\n",
    "                dest_path = os.path.join(train_folder, label)\n",
    "            else:\n",
    "                dest_path = os.path.join(val_folder, label)\n",
    "            if patch_name in os.listdir(dest_path):\n",
    "                pass\n",
    "            else:\n",
    "                shutil.copy(source_path, dest_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Delete all the images in the train and val folders that are not in subdirectories\n",
    "for image in os.listdir(train_folder):\n",
    "    if os.path.isfile(os.path.join(train_folder, image)):\n",
    "        os.remove(os.path.join(train_folder, image))\n",
    "        \n",
    "for image in os.listdir(val_folder):\n",
    "    if os.path.isfile(os.path.join(val_folder, image)):\n",
    "        os.remove(os.path.join(val_folder, image))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create folders for each type of lesion\n",
    "lesion_folders = [\"M\", \"E\", \"S\", \"C\"]\n",
    "for lesion in lesion_folders:\n",
    "    lesion_path = os.path.join(dataset_folder, lesion)\n",
    "    os.makedirs(lesion_path, exist_ok=True)\n",
    "    for step in [\"train\", \"val\"]:\n",
    "        os.makedirs(os.path.join(lesion_path, step), exist_ok=True)\n",
    "        if lesion == \"M\":\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"nan_label\"), exist_ok=True)\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"noM\"), exist_ok=True)\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"yesM\"), exist_ok=True)\n",
    "        if lesion == \"E\":\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"noE\"), exist_ok=True)\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"yesE\"), exist_ok=True)\n",
    "        if lesion == \"S\":\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"GGS\"), exist_ok=True)\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"NoGS\"), exist_ok=True)\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"SGS\"), exist_ok=True)\n",
    "        if lesion == \"C\":\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"noC\"), exist_ok=True)\n",
    "            os.makedirs(os.path.join(lesion_path, step, \"yesC\"), exist_ok=True)\n",
    "            \n",
    "# Move the images to the appropriate folders\n",
    "lesion_labels_dict = {\n",
    "    \"M\": [\"nan_label\", \"noM\", \"yesM\"],\n",
    "    \"E\": [\"noE\", \"yesE\"],\n",
    "    \"S\": [\"GGS\", \"NoGS\", \"SGS\"],\n",
    "    \"C\": [\"noC\", \"yesC\"]\n",
    "}\n",
    "\n",
    "# Add the possibility to empty the folders if they are not empty\n",
    "for lesion in lesion_folders:\n",
    "    for step in [\"train\", \"val\"]:\n",
    "        for label in lesion_labels_dict[lesion]:\n",
    "            if len(os.listdir(os.path.join(dataset_folder, lesion, step, label))) > 0:\n",
    "                response = input(f\"The {lesion}/{step}/{label} folder is not empty. Do you want to empty the folder? (yes/no): \")\n",
    "                if response.lower() == \"yes\":\n",
    "                    shutil.rmtree(os.path.join(dataset_folder, lesion, step, label))\n",
    "                    os.makedirs(os.path.join(dataset_folder, lesion, step, label), exist_ok=True)\n",
    "                    \n",
    "# Move the images to the appropriate folders                  \n",
    "for lesion in lesion_labels_dict.keys():\n",
    "    for step in [\"train\", \"val\"]:\n",
    "        for label in lesion_labels_dict[lesion]:\n",
    "            source_folder = os.path.join(dataset_folder, step, label)\n",
    "            destination_folder = os.path.join(dataset_folder, lesion, step, label)\n",
    "            for image in os.listdir(source_folder):\n",
    "                source_path = os.path.join(source_folder, image)\n",
    "                destination_path = os.path.join(destination_folder, image)\n",
    "                shutil.move(source_path, destination_path)\n",
    "            os.rmdir(source_folder)\n",
    "\n",
    "os.rmdir(train_folder)\n",
    "os.rmdir(val_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "M: 416 images\n",
      "E: 414 images\n",
      "S: 465 images\n",
      "C: 415 images\n"
     ]
    }
   ],
   "source": [
    "# Give the amount of images by lesion\n",
    "for lesion in lesion_folders:\n",
    "    num_images = 0\n",
    "    for step in [\"train\", \"val\"]:\n",
    "        for label in lesion_labels_dict[lesion]:\n",
    "            num_images += len(os.listdir(os.path.join(dataset_folder, lesion, step, label)))\n",
    "    print(f\"{lesion}: {num_images} images\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "segmentation",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}