{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Delete the lines with a brown background color in the excel files\n", "The excel files are located in the Data/Classification/labeled_data folder of the MESCnn repository." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from openpyxl import Workbook, load_workbook\n", "import os \n", "\n", "path_to_excel = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n", "\n", "# Function to get the RGB value of a color\n", "def get_rgb(color):\n", " return tuple(int(color[i:i+2], 16) for i in (0, 2, 4))\n", "\n", "for file in os.listdir(path_to_excel):\n", " if file.endswith(\".xlsx\") or file.endswith(\".XLSX\"):\n", " file = os.path.join(path_to_excel, file)\n", " # Load the workbook\n", " workbook = load_workbook(file)\n", " \n", " # Select the first sheet\n", " sheet = workbook.active\n", " \n", " # Create a new workbook\n", " new_workbook = Workbook()\n", " new_sheet = new_workbook.active\n", " \n", " # List to store rows with RGB colors\n", " rows_with_rgb = []\n", " \n", " # Iterate through each row\n", " for row_idx, row in enumerate(sheet.iter_rows(), start=1):\n", " row_colors = []\n", " has_rgb_color = False # Flag to check if row has any RGB color\n", " # Iterate through each cell in the row\n", " for cell in row:\n", " fill = cell.fill\n", " if fill.start_color.type == 'rgb':\n", " rgb_value = get_rgb(fill.start_color.rgb)\n", " row_colors.append(rgb_value)\n", " has_rgb_color = True\n", " # Check if the row has at least one RGB color\n", " if has_rgb_color:\n", " rows_with_rgb.append(row)\n", " \n", " # Write rows with RGB colors to the new workbook\n", " for row in rows_with_rgb:\n", " new_sheet.append([cell.value for cell in row])\n", " \n", " # Save the new workbook\n", " new_workbook.save(file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract labeled data from excel files" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C1104066_JGI.XLSX\n", "C1105034_JGI.XLSX\n", "C1110748_JGI.xlsx\n", "C1112141_JGI.XLSX\n", "C1105798_JGI.xlsx\n", "C1117893_JGI.xlsx\n", "C1107892_JGI.xlsx\n", "C1107752_JGI.xlsx\n", "C1105642_JGI.XLSX\n", " Patch names M E S \\\n", "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg 0 0 1 \n", "1 glomerulus C1104066 [142336, 49680, 744, 640].... 0 0 GGS \n", "2 glomerulus C1104066 [142772, 48280, 1100, 864]... 1 0 0 \n", "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg 0 0 GGS \n", "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg 0 0 1 \n", ".. ... ... ... ... \n", "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg nan_label noE GGS \n", "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg 1 0 1 \n", "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg nan_label noE GGS \n", "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg nan_label noE GGS \n", "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg nan_label noE GGS \n", "\n", " C \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", ".. ... \n", "47 noC \n", "48 0 \n", "49 noC \n", "50 noC \n", "51 noC \n", "\n", "[470 rows x 5 columns]\n", "(470, 5)\n" ] } ], "source": [ "import pandas as pd\n", " \n", "# Set the path to the labeled data directory\n", "labeled_data_dir = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n", "\n", "# Get the list of excel files in the labeled data directory\n", "excel_files = [file for file in os.listdir(labeled_data_dir) if file.endswith(\".xlsx\") or file.endswith(\".XLSX\")]\n", "\n", "# Create an empty dataframe\n", "df_combined = pd.DataFrame(columns=[\"Patch names\", \"M\", \"E\", \"S\", \"C\"])\n", "\n", "# Iterate over the excel files\n", "for file in excel_files:\n", " print(file)\n", " # Read the excel file\n", " df = pd.read_excel(os.path.join(labeled_data_dir, file))\n", " \n", " if file == \"C1107752_JGI.xlsx\": # This file raises an error for a reason I don't understand\n", " corrected_index = 61 \n", " else:\n", " # Find the index of the row with \"CORRECTED\" or \"Corrected\" value in the first column\n", " if (df.iloc[:, 0] == \"CORRECTED\").any():\n", " corrected_index = df[df.iloc[:, 0] == \"CORRECTED\"].index[0]\n", " elif (df.iloc[:, 0] == \"Corrected\").any():\n", " corrected_index = df[df.iloc[:, 0] == \"Corrected\"].index[0]\n", " elif (df.iloc[:, 0] == \"CORRECTED JGI\").any():\n", " corrected_index = df[df.iloc[:, 0] == \"CORRECTED JGI\"].index[0]\n", " else:\n", " corrected_index = df[df.iloc[:, 0] == \"filename\"].index[0] \n", " \n", " # Skip the rows before the \"CORRECTED\" row and select the following rows\n", " df = df.iloc[corrected_index + 1:]\n", " \n", " # Get the values in the M, E, S, and C columns\n", " m_values = df[\"M\"].values\n", " e_values = df[\"E\"].values\n", " s_values = df[\"S\"].values\n", " c_values = df[\"C\"].values\n", " \n", " # Get the name of each patch in the Patch_name column\n", " patch_names = df[\"filename\"].values\n", " \n", " # Split the patch names to keep only the part after the last '\\'\n", " patch_names = [name.split('\\\\')[-1] for name in patch_names]\n", " \n", " # Create a dataframe for the current file\n", " df_current = pd.DataFrame({\n", " \"Patch names\": patch_names,\n", " \"M\": m_values,\n", " \"E\": e_values,\n", " \"S\": s_values,\n", " \"C\": c_values\n", " })\n", " \n", " # Append the current dataframe to the combined dataframe\n", " df_combined = pd.concat([df_combined, df_current])\n", "\n", "# Print the combined dataframe\n", "print(df_combined)\n", "print(df_combined.shape)\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Patch names M E S \\\n", "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg noM noE SGS \n", "1 glomerulus C1104066 [142336, 49680, 744, 640].... noM noE GGS \n", "2 glomerulus C1104066 [142772, 48280, 1100, 864]... yesM noE NoGS \n", "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg noM noE GGS \n", "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg noM noE SGS \n", ".. ... ... ... ... \n", "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg nan_label noE GGS \n", "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg yesM noE SGS \n", "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg nan_label noE GGS \n", "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg nan_label noE GGS \n", "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg nan_label noE GGS \n", "\n", " C \n", "0 noC \n", "1 noC \n", "2 noC \n", "3 noC \n", "4 noC \n", ".. ... \n", "47 noC \n", "48 noC \n", "49 noC \n", "50 noC \n", "51 noC \n", "\n", "[470 rows x 5 columns]\n" ] } ], "source": [ "mesc_def = {\n", " \"M\": {\n", " 0: \"noM\",\n", " 1: \"yesM\",\n", " },\n", " \"E\": {\n", " 0: \"noE\",\n", " 1: \"yesE\"\n", " },\n", " \"S\": {\n", " \"GGS\": \"GGS\",\n", " 0: \"NoGS\",\n", " 1: \"SGS\"\n", " },\n", " \"C\": {\n", " 0: \"noC\",\n", " 1: \"yesC\"\n", " }\n", "}\n", "df_combined[\"M\"] = df_combined[\"M\"].replace(mesc_def[\"M\"])\n", "df_combined[\"E\"] = df_combined[\"E\"].replace(mesc_def[\"E\"])\n", "df_combined[\"S\"] = df_combined[\"S\"].replace(mesc_def[\"S\"])\n", "df_combined[\"C\"] = df_combined[\"C\"].replace(mesc_def[\"C\"])\n", "print(df_combined)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['yesE', 'noM', 'noE', 'NoGS', 10, 'yesC', 'noC', 'yesM', 'SGS', 'GGS', nan, 'nan_label']\n", " Patch names M E S C\n", "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg noM noE SGS noC\n", "1 glomerulus C1104066 [142336, 49680, 744, 640].... NaN NaN GGS NaN\n", "2 glomerulus C1104066 [142772, 48280, 1100, 864]... yesM noE NoGS noC\n", "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg NaN NaN GGS NaN\n", "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg noM noE SGS noC\n", ".. ... ... ... ... ...\n", "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg NaN NaN GGS NaN\n", "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg yesM noE SGS noC\n", "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg NaN NaN GGS NaN\n", "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg NaN NaN GGS NaN\n", "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg NaN NaN GGS NaN\n", "\n", "[470 rows x 5 columns]\n" ] } ], "source": [ "import numpy as np\n", "labels = df_combined[['M', 'E', 'S', 'C']].values.flatten()\n", "distinct_labels = list(set(labels))\n", "print(distinct_labels)\n", "\n", "possible_labels = [\"noM\", \"yesM\", \"noE\", \"yesE\", \"GGS\", \"NoGS\", \"SGS\", \"noC\", \"yesC\", \"nan_label\"]\n", "\n", "# Replace values that are not in the possible_labels list with NaN\n", "df_combined.loc[:, 'M':'C'] = df_combined.loc[:, 'M':'C'].apply(lambda x: np.where(x.isin(possible_labels), x, np.nan))\n", "\n", "# If the value in the S column is \"GGS\", set the value in the other columns to NaN\n", "df_combined.loc[df_combined[\"S\"] == \"GGS\", [\"M\", \"E\", \"C\"]] = np.nan\n", "\n", "# Print the updated dataframe\n", "print(df_combined)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Patch names M E S C\n", "1 glomerulus C1104066 [142336, 49680, 744, 640].... NaN NaN GGS NaN\n", "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg NaN NaN GGS NaN\n", "7 glomerulus C1104066 [8044, 62252, 752, 796].jpeg NaN NaN GGS NaN\n", "15 glomerulus C1104066 [94652, 48228, 636, 644].jpeg NaN NaN GGS NaN\n", "17 glomerulus C1105034 [150832, 29052, 600, 496].... NaN NaN GGS NaN\n", "9 glomerulus C1110748 [129452, 5728, 708, 512].jpeg NaN NaN GGS NaN\n", "19 glomerulus C1110748 [134904, 7652, 776, 692].jpeg NaN NaN GGS NaN\n", "22 glomerulus C1110748 [136192, 55140, 788, 688].... NaN NaN GGS NaN\n", "25 glomerulus C1110748 [145592, 41936, 740, 640].... NaN NaN GGS NaN\n", "40 glomerulus C1110748 [154628, 24972, 804, 684].... NaN NaN GGS NaN\n", "41 glomerulus C1110748 [155592, 25764, 648, 612].... NaN NaN GGS NaN\n", "46 glomerulus C1110748 [156748, 71428, 812, 692].... NaN NaN GGS NaN\n", "48 glomerulus C1110748 [157812, 72180, 600, 536].... NaN NaN GGS NaN\n", "36 glomerulus C1112141 [78580, 16560, 656, 788].jpeg NaN NaN GGS NaN\n", "43 glomerulus C1112141 [82724, 17252, 860, 808].jpeg NaN NaN GGS NaN\n", "46 glomerulus C1112141 [83852, 19840, 884, 944].jpeg yesM NaN NoGS noC\n", "48 glomerulus C1112141 [86140, 60432, 720, 776].jpeg NaN NaN GGS NaN\n", "50 glomerulus C1112141 [87964, 20760, 672, 732].jpeg NaN NaN GGS NaN\n", "55 glomerulus C1112141 [90196, 61504, 848, 804].jpeg NaN NaN GGS NaN\n", "58 glomerulus C1112141 [95092, 65612, 680, 668].jpeg NaN NaN GGS NaN\n", "4 glomerulus C1105798 [118952, 9668, 980, 896].jpeg NaN NaN GGS NaN\n", "6 glomerulus C1105798 [120488, 15428, 684, 516].... NaN NaN GGS NaN\n", "14 glomerulus C1105798 [129104, 54064, 708, 576].... NaN NaN GGS NaN\n", "54 glomerulus C1105798 [76196, 61668, 740, 968].jpeg NaN NaN GGS NaN\n", "28 glomerulus C1117893 [26068, 32092, 724, 708].jpeg NaN NaN GGS NaN\n", "32 glomerulus C1117893 [31252, 77564, 700, 696].jpeg NaN NaN GGS NaN\n", "33 glomerulus C1117893 [65224, 17120, 528, 544].jpeg NaN NaN GGS NaN\n", "11 glomerulus C1107892 [126480, 27244, 588, 564].... NaN NaN GGS NaN\n", "43 glomerulus C1107892 [75916, 26668, 564, 572].jpeg NaN NaN GGS NaN\n", "44 glomerulus C1107892 [76200, 75040, 508, 576].jpeg NaN NaN GGS NaN\n", "48 glomerulus C1107892 [77772, 25272, 740, 760].jpeg NaN NaN GGS NaN\n", "49 glomerulus C1107892 [77980, 73584, 732, 724].jpeg NaN NaN GGS NaN\n", "55 glomerulus C1107892 [80568, 69696, 616, 644].jpeg NaN NaN GGS NaN\n", "56 glomerulus C1107892 [80608, 21544, 624, 660].jpeg NaN NaN GGS NaN\n", "11 glomerulus C1105642 [136108, 72452, 612, 532].... NaN NaN GGS NaN\n", "12 glomerulus C1105642 [136892, 73056, 596, 540].... NaN NaN GGS NaN\n", "13 glomerulus C1105642 [137860, 71816, 640, 728].... NaN NaN GGS NaN\n", "18 glomerulus C1105642 [140788, 20956, 616, 548].... NaN NaN GGS NaN\n", "19 glomerulus C1105642 [141656, 21460, 620, 576].... NaN NaN GGS NaN\n", "20 glomerulus C1105642 [142460, 20320, 540, 512].... NaN NaN GGS NaN\n", "22 glomerulus C1105642 [14640, 21940, 524, 584].jpeg NaN NaN GGS NaN\n", "29 glomerulus C1105642 [64876, 12060, 596, 648].jpeg NaN NaN GGS NaN\n", "33 glomerulus C1105642 [67600, 62876, 656, 680].jpeg NaN NaN GGS NaN\n", "35 glomerulus C1105642 [68388, 15580, 644, 604].jpeg NaN NaN GGS NaN\n", "40 glomerulus C1105642 [70972, 66596, 652, 628].jpeg NaN NaN GGS NaN\n", "41 glomerulus C1105642 [71324, 17312, 560, 556].jpeg NaN NaN GGS NaN\n", "46 glomerulus C1105642 [72752, 20572, 620, 524].jpeg NaN NaN GGS NaN\n", "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg NaN NaN GGS NaN\n", "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg NaN NaN GGS NaN\n", "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg NaN NaN GGS NaN\n", "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg NaN NaN GGS NaN\n" ] } ], "source": [ "nan_rows = df_combined[df_combined.isnull().any(axis=1)]\n", "print(nan_rows)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Patch namesMESC
1glomerulus C1107752 [130360, 32956, 1020, 1008...yesMyesENoGSyesC
6glomerulus C1107752 [135308, 69504, 1012, 1004...yesMnoENoGSyesC
10glomerulus C1107752 [137584, 31764, 836, 872]....yesMnoENoGSyesC
39glomerulus C1107752 [87436, 35528, 724, 844].jpegyesMnoENoGSyesC
2glomerulus C1105642 [120200, 56808, 1304, 1140...yesMnoESGSyesC
\n", "
" ], "text/plain": [ " Patch names M E S C\n", "1 glomerulus C1107752 [130360, 32956, 1020, 1008... yesM yesE NoGS yesC\n", "6 glomerulus C1107752 [135308, 69504, 1012, 1004... yesM noE NoGS yesC\n", "10 glomerulus C1107752 [137584, 31764, 836, 872].... yesM noE NoGS yesC\n", "39 glomerulus C1107752 [87436, 35528, 724, 844].jpeg yesM noE NoGS yesC\n", "2 glomerulus C1105642 [120200, 56808, 1304, 1140... yesM noE SGS yesC" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# print the rows with yesC in the C column\n", "yesC_rows = df_combined[df_combined[\"C\"] == \"yesC\"]\n", "yesC_rows" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Separate the patches into train and val sets \n", "Test set needs to be added but we didn't have enough data so we decided to use the validation set as the test set." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Seed is -828\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "WSI images have been split into train and val folders.\n" ] } ], "source": [ "import random\n", "import shutil\n", "import sys\n", "\n", "# Set the path to the Crop-256 folder\n", "crop256_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Crops\"\n", "\n", "# Set the path to the Data/Classification folder\n", "dataset_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification\"\n", "\n", "# Set the train and val ratio\n", "train_ratio = 0.7\n", "val_ratio = 0.3\n", "\n", "# Create the train and val folders\n", "train_folder = os.path.join(dataset_folder, \"train\")\n", "val_folder = os.path.join(dataset_folder, \"val\")\n", "os.makedirs(train_folder, exist_ok=True)\n", "os.makedirs(val_folder, exist_ok=True)\n", "\n", "# If the train and val folders are not empty, ask the user to confirm if they want to overwrite the folders\n", "if len(os.listdir(train_folder)) > 0 or len(os.listdir(val_folder)) > 0:\n", " response = input(\"The train and val folders are not empty. Do you want to overwrite the folders? (yes/no): \")\n", " if response.lower() != \"yes\":\n", " print(\"Exiting the script.\")\n", " sys.exit()\n", " if response.lower() == \"yes\":\n", " # Remove the existing folders\n", " shutil.rmtree(train_folder)\n", " shutil.rmtree(val_folder)\n", " # Create the folders again\n", " os.makedirs(train_folder, exist_ok=True)\n", " os.makedirs(val_folder, exist_ok=True)\n", " \n", "# Get the list of WSI folders in the Crop-256 folder\n", "wsi_folders = [wsi for wsi in os.listdir(crop256_folder)]\n", "\n", "# Shuffle the list of WSI images\n", "seed = random.randint(-1000, 1000)\n", "print(f\"Seed is {seed}\")\n", "random.seed(seed) # Allows for reproducibility\n", "\n", "imgs = []\n", "os.makedirs(os.path.join(train_folder), exist_ok=True)\n", "for wsi in wsi_folders:\n", " # Copy the images to the train folder\n", " for image in os.listdir(os.path.join(crop256_folder, wsi)):\n", " src_path = os.path.join(crop256_folder, wsi, image)\n", " dst_path = os.path.join(dataset_folder, image)\n", " imgs.append(image)\n", " shutil.copy(src_path, dst_path)\n", "\n", "# Shuffle the list of image paths\n", "random.seed(seed) # Allows for reproducibility\n", "random.shuffle(imgs)\n", "\n", "# Split the image paths into train and val sets\n", "train_size = int(train_ratio * len(imgs))\n", "train_imgs = imgs[:train_size]\n", "val_imgs = imgs[train_size:]\n", "\n", "# Copy the train images to the train folder\n", "os.makedirs(os.path.join(train_folder), exist_ok=True)\n", "# Copy the images to the train folder\n", "for image in train_imgs:\n", " src_path = os.path.join(dataset_folder, image)\n", " dst_path = os.path.join(train_folder, image)\n", " shutil.copy(src_path, dst_path)\n", " \n", "# Create the folder in the val folder\n", "os.makedirs(os.path.join(val_folder), exist_ok=True)\n", "# Copy the images to the val folder\n", "for image in val_imgs:\n", " src_path = os.path.join(dataset_folder, image)\n", " dst_path = os.path.join(val_folder, image)\n", " shutil.copy(src_path, dst_path)\n", "\n", "# Remove the images from the dataset folder\n", "for image in imgs:\n", " os.remove(os.path.join(dataset_folder, image))\n", "\n", "print(\"WSI images have been split into train and val folders.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sort the patches into their respective classes" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Set the path to the train and val folders\n", "train_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/train\"\n", "val_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/val\"\n", "\n", "# Create new subdirectories for the labels in the train and val folders \n", "for label in possible_labels:\n", " os.makedirs(os.path.join(train_folder, label), exist_ok=True)\n", " os.makedirs(os.path.join(val_folder, label), exist_ok=True)\n", " \n", "# Iterate over the rows in the df_combined dataframe\n", "for index, row in df_combined.iterrows():\n", " # Get the labels of the current row\n", " labels = row[[\"M\", \"E\", \"S\", \"C\"]]\n", " \n", " # Get the name of the current patch\n", " patch_name = row[\"Patch names\"]\n", " \n", " # Set the source path of the image\n", " if patch_name in os.listdir(train_folder):\n", " source_path = os.path.join(train_folder, patch_name)\n", " elif patch_name in os.listdir(val_folder):\n", " source_path = os.path.join(val_folder, patch_name)\n", " \n", " # Set the destination paths of the image\n", " for label in labels:\n", " if label in possible_labels:\n", " if source_path.split(\"/\")[-2] == \"train\":\n", " dest_path = os.path.join(train_folder, label)\n", " else:\n", " dest_path = os.path.join(val_folder, label)\n", " if patch_name in os.listdir(dest_path):\n", " pass\n", " else:\n", " shutil.copy(source_path, dest_path)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Delete all the images in the train and val folders that are not in subdirectories\n", "for image in os.listdir(train_folder):\n", " if os.path.isfile(os.path.join(train_folder, image)):\n", " os.remove(os.path.join(train_folder, image))\n", " \n", "for image in os.listdir(val_folder):\n", " if os.path.isfile(os.path.join(val_folder, image)):\n", " os.remove(os.path.join(val_folder, image))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Create folders for each type of lesion\n", "lesion_folders = [\"M\", \"E\", \"S\", \"C\"]\n", "for lesion in lesion_folders:\n", " lesion_path = os.path.join(dataset_folder, lesion)\n", " os.makedirs(lesion_path, exist_ok=True)\n", " for step in [\"train\", \"val\"]:\n", " os.makedirs(os.path.join(lesion_path, step), exist_ok=True)\n", " if lesion == \"M\":\n", " os.makedirs(os.path.join(lesion_path, step, \"nan_label\"), exist_ok=True)\n", " os.makedirs(os.path.join(lesion_path, step, \"noM\"), exist_ok=True)\n", " os.makedirs(os.path.join(lesion_path, step, \"yesM\"), exist_ok=True)\n", " if lesion == \"E\":\n", " os.makedirs(os.path.join(lesion_path, step, \"noE\"), exist_ok=True)\n", " os.makedirs(os.path.join(lesion_path, step, \"yesE\"), exist_ok=True)\n", " if lesion == \"S\":\n", " os.makedirs(os.path.join(lesion_path, step, \"GGS\"), exist_ok=True)\n", " os.makedirs(os.path.join(lesion_path, step, \"NoGS\"), exist_ok=True)\n", " os.makedirs(os.path.join(lesion_path, step, \"SGS\"), exist_ok=True)\n", " if lesion == \"C\":\n", " os.makedirs(os.path.join(lesion_path, step, \"noC\"), exist_ok=True)\n", " os.makedirs(os.path.join(lesion_path, step, \"yesC\"), exist_ok=True)\n", " \n", "# Move the images to the appropriate folders\n", "lesion_labels_dict = {\n", " \"M\": [\"nan_label\", \"noM\", \"yesM\"],\n", " \"E\": [\"noE\", \"yesE\"],\n", " \"S\": [\"GGS\", \"NoGS\", \"SGS\"],\n", " \"C\": [\"noC\", \"yesC\"]\n", "}\n", "\n", "# Add the possibility to empty the folders if they are not empty\n", "for lesion in lesion_folders:\n", " for step in [\"train\", \"val\"]:\n", " for label in lesion_labels_dict[lesion]:\n", " if len(os.listdir(os.path.join(dataset_folder, lesion, step, label))) > 0:\n", " response = input(f\"The {lesion}/{step}/{label} folder is not empty. Do you want to empty the folder? (yes/no): \")\n", " if response.lower() == \"yes\":\n", " shutil.rmtree(os.path.join(dataset_folder, lesion, step, label))\n", " os.makedirs(os.path.join(dataset_folder, lesion, step, label), exist_ok=True)\n", " \n", "# Move the images to the appropriate folders \n", "for lesion in lesion_labels_dict.keys():\n", " for step in [\"train\", \"val\"]:\n", " for label in lesion_labels_dict[lesion]:\n", " source_folder = os.path.join(dataset_folder, step, label)\n", " destination_folder = os.path.join(dataset_folder, lesion, step, label)\n", " for image in os.listdir(source_folder):\n", " source_path = os.path.join(source_folder, image)\n", " destination_path = os.path.join(destination_folder, image)\n", " shutil.move(source_path, destination_path)\n", " os.rmdir(source_folder)\n", "\n", "os.rmdir(train_folder)\n", "os.rmdir(val_folder)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "M: 416 images\n", "E: 414 images\n", "S: 465 images\n", "C: 415 images\n" ] } ], "source": [ "# Give the amount of images by lesion\n", "for lesion in lesion_folders:\n", " num_images = 0\n", " for step in [\"train\", \"val\"]:\n", " for label in lesion_labels_dict[lesion]:\n", " num_images += len(os.listdir(os.path.join(dataset_folder, lesion, step, label)))\n", " print(f\"{lesion}: {num_images} images\")" ] } ], "metadata": { "kernelspec": { "display_name": "segmentation", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }