{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Delete the lines with a brown background color in the excel files\n", "The excel files are located in the Data/Classification/labeled_data folder of the MESCnn repository." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from openpyxl import Workbook, load_workbook\n", "import os \n", "\n", "path_to_excel = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n", "\n", "# Function to get the RGB value of a color\n", "def get_rgb(color):\n", " return tuple(int(color[i:i+2], 16) for i in (0, 2, 4))\n", "\n", "for file in os.listdir(path_to_excel):\n", " if file.endswith(\".xlsx\") or file.endswith(\".XLSX\"):\n", " file = os.path.join(path_to_excel, file)\n", " # Load the workbook\n", " workbook = load_workbook(file)\n", " \n", " # Select the first sheet\n", " sheet = workbook.active\n", " \n", " # Create a new workbook\n", " new_workbook = Workbook()\n", " new_sheet = new_workbook.active\n", " \n", " # List to store rows with RGB colors\n", " rows_with_rgb = []\n", " \n", " # Iterate through each row\n", " for row_idx, row in enumerate(sheet.iter_rows(), start=1):\n", " row_colors = []\n", " has_rgb_color = False # Flag to check if row has any RGB color\n", " # Iterate through each cell in the row\n", " for cell in row:\n", " fill = cell.fill\n", " if fill.start_color.type == 'rgb':\n", " rgb_value = get_rgb(fill.start_color.rgb)\n", " row_colors.append(rgb_value)\n", " has_rgb_color = True\n", " # Check if the row has at least one RGB color\n", " if has_rgb_color:\n", " rows_with_rgb.append(row)\n", " \n", " # Write rows with RGB colors to the new workbook\n", " for row in rows_with_rgb:\n", " new_sheet.append([cell.value for cell in row])\n", " \n", " # Save the new workbook\n", " new_workbook.save(file)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract labeled data from excel files" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C1104066_JGI.XLSX\n", "C1105034_JGI.XLSX\n", "C1110748_JGI.xlsx\n", "C1112141_JGI.XLSX\n", "C1105798_JGI.xlsx\n", "C1117893_JGI.xlsx\n", "C1107892_JGI.xlsx\n", "C1107752_JGI.xlsx\n", "C1105642_JGI.XLSX\n", " Patch names M E S \\\n", "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg 0 0 1 \n", "1 glomerulus C1104066 [142336, 49680, 744, 640].... 0 0 GGS \n", "2 glomerulus C1104066 [142772, 48280, 1100, 864]... 1 0 0 \n", "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg 0 0 GGS \n", "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg 0 0 1 \n", ".. ... ... ... ... \n", "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg nan_label noE GGS \n", "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg 1 0 1 \n", "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg nan_label noE GGS \n", "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg nan_label noE GGS \n", "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg nan_label noE GGS \n", "\n", " C \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", ".. ... \n", "47 noC \n", "48 0 \n", "49 noC \n", "50 noC \n", "51 noC \n", "\n", "[470 rows x 5 columns]\n", "(470, 5)\n" ] } ], "source": [ "import pandas as pd\n", " \n", "# Set the path to the labeled data directory\n", "labeled_data_dir = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n", "\n", "# Get the list of excel files in the labeled data directory\n", "excel_files = [file for file in os.listdir(labeled_data_dir) if file.endswith(\".xlsx\") or file.endswith(\".XLSX\")]\n", "\n", "# Create an empty dataframe\n", "df_combined = pd.DataFrame(columns=[\"Patch names\", \"M\", \"E\", \"S\", \"C\"])\n", "\n", "# Iterate over the excel files\n", "for file in excel_files:\n", " print(file)\n", " # Read the excel file\n", " df = pd.read_excel(os.path.join(labeled_data_dir, file))\n", " \n", " if file == \"C1107752_JGI.xlsx\": # This file raises an error for a reason I don't understand\n", " corrected_index = 61 \n", " else:\n", " # Find the index of the row with \"CORRECTED\" or \"Corrected\" value in the first column\n", " if (df.iloc[:, 0] == \"CORRECTED\").any():\n", " corrected_index = df[df.iloc[:, 0] == \"CORRECTED\"].index[0]\n", " elif (df.iloc[:, 0] == \"Corrected\").any():\n", " corrected_index = df[df.iloc[:, 0] == \"Corrected\"].index[0]\n", " elif (df.iloc[:, 0] == \"CORRECTED JGI\").any():\n", " corrected_index = df[df.iloc[:, 0] == \"CORRECTED JGI\"].index[0]\n", " else:\n", " corrected_index = df[df.iloc[:, 0] == \"filename\"].index[0] \n", " \n", " # Skip the rows before the \"CORRECTED\" row and select the following rows\n", " df = df.iloc[corrected_index + 1:]\n", " \n", " # Get the values in the M, E, S, and C columns\n", " m_values = df[\"M\"].values\n", " e_values = df[\"E\"].values\n", " s_values = df[\"S\"].values\n", " c_values = df[\"C\"].values\n", " \n", " # Get the name of each patch in the Patch_name column\n", " patch_names = df[\"filename\"].values\n", " \n", " # Split the patch names to keep only the part after the last '\\'\n", " patch_names = [name.split('\\\\')[-1] for name in patch_names]\n", " \n", " # Create a dataframe for the current file\n", " df_current = pd.DataFrame({\n", " \"Patch names\": patch_names,\n", " \"M\": m_values,\n", " \"E\": e_values,\n", " \"S\": s_values,\n", " \"C\": c_values\n", " })\n", " \n", " # Append the current dataframe to the combined dataframe\n", " df_combined = pd.concat([df_combined, df_current])\n", "\n", "# Print the combined dataframe\n", "print(df_combined)\n", "print(df_combined.shape)\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Patch names M E S \\\n", "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg noM noE SGS \n", "1 glomerulus C1104066 [142336, 49680, 744, 640].... noM noE GGS \n", "2 glomerulus C1104066 [142772, 48280, 1100, 864]... yesM noE NoGS \n", "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg noM noE GGS \n", "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg noM noE SGS \n", ".. ... ... ... ... \n", "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg nan_label noE GGS \n", "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg yesM noE SGS \n", "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg nan_label noE GGS \n", "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg nan_label noE GGS \n", "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg nan_label noE GGS \n", "\n", " C \n", "0 noC \n", "1 noC \n", "2 noC \n", "3 noC \n", "4 noC \n", ".. ... \n", "47 noC \n", "48 noC \n", "49 noC \n", "50 noC \n", "51 noC \n", "\n", "[470 rows x 5 columns]\n" ] } ], "source": [ "mesc_def = {\n", " \"M\": {\n", " 0: \"noM\",\n", " 1: \"yesM\",\n", " },\n", " \"E\": {\n", " 0: \"noE\",\n", " 1: \"yesE\"\n", " },\n", " \"S\": {\n", " \"GGS\": \"GGS\",\n", " 0: \"NoGS\",\n", " 1: \"SGS\"\n", " },\n", " \"C\": {\n", " 0: \"noC\",\n", " 1: \"yesC\"\n", " }\n", "}\n", "df_combined[\"M\"] = df_combined[\"M\"].replace(mesc_def[\"M\"])\n", "df_combined[\"E\"] = df_combined[\"E\"].replace(mesc_def[\"E\"])\n", "df_combined[\"S\"] = df_combined[\"S\"].replace(mesc_def[\"S\"])\n", "df_combined[\"C\"] = df_combined[\"C\"].replace(mesc_def[\"C\"])\n", "print(df_combined)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['yesE', 'noM', 'noE', 'NoGS', 10, 'yesC', 'noC', 'yesM', 'SGS', 'GGS', nan, 'nan_label']\n", " Patch names M E S C\n", "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg noM noE SGS noC\n", "1 glomerulus C1104066 [142336, 49680, 744, 640].... NaN NaN GGS NaN\n", "2 glomerulus C1104066 [142772, 48280, 1100, 864]... yesM noE NoGS noC\n", "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg NaN NaN GGS NaN\n", "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg noM noE SGS noC\n", ".. ... ... ... ... ...\n", "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg NaN NaN GGS NaN\n", "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg yesM noE SGS noC\n", "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg NaN NaN GGS NaN\n", "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg NaN NaN GGS NaN\n", "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg NaN NaN GGS NaN\n", "\n", "[470 rows x 5 columns]\n" ] } ], "source": [ "import numpy as np\n", "labels = df_combined[['M', 'E', 'S', 'C']].values.flatten()\n", "distinct_labels = list(set(labels))\n", "print(distinct_labels)\n", "\n", "possible_labels = [\"noM\", \"yesM\", \"noE\", \"yesE\", \"GGS\", \"NoGS\", \"SGS\", \"noC\", \"yesC\", \"nan_label\"]\n", "\n", "# Replace values that are not in the possible_labels list with NaN\n", "df_combined.loc[:, 'M':'C'] = df_combined.loc[:, 'M':'C'].apply(lambda x: np.where(x.isin(possible_labels), x, np.nan))\n", "\n", "# If the value in the S column is \"GGS\", set the value in the other columns to NaN\n", "df_combined.loc[df_combined[\"S\"] == \"GGS\", [\"M\", \"E\", \"C\"]] = np.nan\n", "\n", "# Print the updated dataframe\n", "print(df_combined)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Patch names M E S C\n", "1 glomerulus C1104066 [142336, 49680, 744, 640].... NaN NaN GGS NaN\n", "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg NaN NaN GGS NaN\n", "7 glomerulus C1104066 [8044, 62252, 752, 796].jpeg NaN NaN GGS NaN\n", "15 glomerulus C1104066 [94652, 48228, 636, 644].jpeg NaN NaN GGS NaN\n", "17 glomerulus C1105034 [150832, 29052, 600, 496].... NaN NaN GGS NaN\n", "9 glomerulus C1110748 [129452, 5728, 708, 512].jpeg NaN NaN GGS NaN\n", "19 glomerulus C1110748 [134904, 7652, 776, 692].jpeg NaN NaN GGS NaN\n", "22 glomerulus C1110748 [136192, 55140, 788, 688].... NaN NaN GGS NaN\n", "25 glomerulus C1110748 [145592, 41936, 740, 640].... NaN NaN GGS NaN\n", "40 glomerulus C1110748 [154628, 24972, 804, 684].... NaN NaN GGS NaN\n", "41 glomerulus C1110748 [155592, 25764, 648, 612].... NaN NaN GGS NaN\n", "46 glomerulus C1110748 [156748, 71428, 812, 692].... NaN NaN GGS NaN\n", "48 glomerulus C1110748 [157812, 72180, 600, 536].... NaN NaN GGS NaN\n", "36 glomerulus C1112141 [78580, 16560, 656, 788].jpeg NaN NaN GGS NaN\n", "43 glomerulus C1112141 [82724, 17252, 860, 808].jpeg NaN NaN GGS NaN\n", "46 glomerulus C1112141 [83852, 19840, 884, 944].jpeg yesM NaN NoGS noC\n", "48 glomerulus C1112141 [86140, 60432, 720, 776].jpeg NaN NaN GGS NaN\n", "50 glomerulus C1112141 [87964, 20760, 672, 732].jpeg NaN NaN GGS NaN\n", "55 glomerulus C1112141 [90196, 61504, 848, 804].jpeg NaN NaN GGS NaN\n", "58 glomerulus C1112141 [95092, 65612, 680, 668].jpeg NaN NaN GGS NaN\n", "4 glomerulus C1105798 [118952, 9668, 980, 896].jpeg NaN NaN GGS NaN\n", "6 glomerulus C1105798 [120488, 15428, 684, 516].... NaN NaN GGS NaN\n", "14 glomerulus C1105798 [129104, 54064, 708, 576].... NaN NaN GGS NaN\n", "54 glomerulus C1105798 [76196, 61668, 740, 968].jpeg NaN NaN GGS NaN\n", "28 glomerulus C1117893 [26068, 32092, 724, 708].jpeg NaN NaN GGS NaN\n", "32 glomerulus C1117893 [31252, 77564, 700, 696].jpeg NaN NaN GGS NaN\n", "33 glomerulus C1117893 [65224, 17120, 528, 544].jpeg NaN NaN GGS NaN\n", "11 glomerulus C1107892 [126480, 27244, 588, 564].... NaN NaN GGS NaN\n", "43 glomerulus C1107892 [75916, 26668, 564, 572].jpeg NaN NaN GGS NaN\n", "44 glomerulus C1107892 [76200, 75040, 508, 576].jpeg NaN NaN GGS NaN\n", "48 glomerulus C1107892 [77772, 25272, 740, 760].jpeg NaN NaN GGS NaN\n", "49 glomerulus C1107892 [77980, 73584, 732, 724].jpeg NaN NaN GGS NaN\n", "55 glomerulus C1107892 [80568, 69696, 616, 644].jpeg NaN NaN GGS NaN\n", "56 glomerulus C1107892 [80608, 21544, 624, 660].jpeg NaN NaN GGS NaN\n", "11 glomerulus C1105642 [136108, 72452, 612, 532].... NaN NaN GGS NaN\n", "12 glomerulus C1105642 [136892, 73056, 596, 540].... NaN NaN GGS NaN\n", "13 glomerulus C1105642 [137860, 71816, 640, 728].... NaN NaN GGS NaN\n", "18 glomerulus C1105642 [140788, 20956, 616, 548].... NaN NaN GGS NaN\n", "19 glomerulus C1105642 [141656, 21460, 620, 576].... NaN NaN GGS NaN\n", "20 glomerulus C1105642 [142460, 20320, 540, 512].... NaN NaN GGS NaN\n", "22 glomerulus C1105642 [14640, 21940, 524, 584].jpeg NaN NaN GGS NaN\n", "29 glomerulus C1105642 [64876, 12060, 596, 648].jpeg NaN NaN GGS NaN\n", "33 glomerulus C1105642 [67600, 62876, 656, 680].jpeg NaN NaN GGS NaN\n", "35 glomerulus C1105642 [68388, 15580, 644, 604].jpeg NaN NaN GGS NaN\n", "40 glomerulus C1105642 [70972, 66596, 652, 628].jpeg NaN NaN GGS NaN\n", "41 glomerulus C1105642 [71324, 17312, 560, 556].jpeg NaN NaN GGS NaN\n", "46 glomerulus C1105642 [72752, 20572, 620, 524].jpeg NaN NaN GGS NaN\n", "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg NaN NaN GGS NaN\n", "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg NaN NaN GGS NaN\n", "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg NaN NaN GGS NaN\n", "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg NaN NaN GGS NaN\n" ] } ], "source": [ "nan_rows = df_combined[df_combined.isnull().any(axis=1)]\n", "print(nan_rows)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Patch names | \n", "M | \n", "E | \n", "S | \n", "C | \n", "
---|---|---|---|---|---|
1 | \n", "glomerulus C1107752 [130360, 32956, 1020, 1008... | \n", "yesM | \n", "yesE | \n", "NoGS | \n", "yesC | \n", "
6 | \n", "glomerulus C1107752 [135308, 69504, 1012, 1004... | \n", "yesM | \n", "noE | \n", "NoGS | \n", "yesC | \n", "
10 | \n", "glomerulus C1107752 [137584, 31764, 836, 872].... | \n", "yesM | \n", "noE | \n", "NoGS | \n", "yesC | \n", "
39 | \n", "glomerulus C1107752 [87436, 35528, 724, 844].jpeg | \n", "yesM | \n", "noE | \n", "NoGS | \n", "yesC | \n", "
2 | \n", "glomerulus C1105642 [120200, 56808, 1304, 1140... | \n", "yesM | \n", "noE | \n", "SGS | \n", "yesC | \n", "