## Delete the rows with a brown background color from the Excel files
The Excel files are located in the Data/Classification/labeled_data folder of the MESCnn repository.

In [4]:
from openpyxl import Workbook, load_workbook
import os 

# Directory holding the Excel annotation files; every .xlsx/.XLSX file found
# here is overwritten in place by the filtering loop further down.
path_to_excel = "/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels"

# Function to get the RGB value of a color
def get_rgb(color):
    """Convert a hexadecimal colour string into an ``(R, G, B)`` tuple.

    openpyxl exposes colours as 8-digit aRGB strings (``"AARRGGBB"``). The
    leading alpha byte is dropped so that the returned channels really are
    red, green and blue rather than alpha, red and green.

    Args:
        color: Hex colour string, either ``"RRGGBB"`` or ``"AARRGGBB"``.

    Returns:
        A tuple of three integers in the range 0-255: ``(red, green, blue)``.

    Raises:
        ValueError: If a channel is not valid hexadecimal (for example
            because the string is too short).
    """
    # Only an 8-digit value carries an alpha prefix; 6-digit input is used as is.
    rgb_hex = color[2:] if len(color) == 8 else color
    return tuple(int(rgb_hex[i:i + 2], 16) for i in (0, 2, 4))

# Rewrite every Excel file of the folder so that it only contains the rows in
# which at least one cell has a fill colour stored as an explicit RGB value.
# Only cell values are carried over; the original file is overwritten.
for file in os.listdir(path_to_excel):
    # str.endswith accepts a tuple, so both spellings are tested in one call
    if not file.endswith((".xlsx", ".XLSX")):
        continue
    file = os.path.join(path_to_excel, file)

    # Load the workbook and select its active sheet
    workbook = load_workbook(file)
    sheet = workbook.active

    # Create a new workbook that receives the retained rows
    new_workbook = Workbook()
    new_sheet = new_workbook.active

    # Keep a row as soon as one of its cells has an 'rgb'-typed fill colour.
    # NOTE(review): openpyxl's default Color also appears to be of type 'rgb',
    # so a row containing any unfilled cell is presumably kept as well --
    # confirm this matches the intended "drop brown rows" behaviour.
    rows_with_rgb = [
        row
        for row in sheet.iter_rows()
        if any(cell.fill.start_color.type == 'rgb' for cell in row)
    ]

    # Write the retained rows (values only) to the new workbook
    for row in rows_with_rgb:
        new_sheet.append([cell.value for cell in row])

    # Save the new workbook over the original file
    new_workbook.save(file)

## Extract labeled data from excel files

In [5]:
import pandas as pd
   
# Set the path to the labeled data directory
labeled_data_dir = "/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels"

# Get the list of excel files in the labeled data directory
excel_files = [file for file in os.listdir(labeled_data_dir) if file.endswith((".xlsx", ".XLSX"))]

# Values of the first column that mark the row right above the labels,
# tried in this order; "filename" is the fallback when no "corrected"
# marker is present.
header_markers = ("CORRECTED", "Corrected", "CORRECTED JGI", "filename")

# Empty template fixing the column layout (also the result when no file is found)
df_combined = pd.DataFrame(columns=["Patch names", "M", "E", "S", "C"])

# One dataframe per excel file; they are concatenated once after the loop
# instead of re-copying the growing dataframe at every iteration.
per_file_frames = []

# Iterate over the excel files
for file in excel_files:
    print(file)
    # Read the excel file
    df = pd.read_excel(os.path.join(labeled_data_dir, file))

    if file == "C1107752_JGI.xlsx":  # This file raises an error for a reason I don't understand
        corrected_index = 61
    else:
        # Find the index of the first row whose first column holds a marker
        first_column = df.iloc[:, 0]
        for marker in header_markers:
            matching_index = df[first_column == marker].index
            if len(matching_index) > 0:
                corrected_index = matching_index[0]
                break
        else:
            # Runs only when no marker matched (the loop never hit `break`)
            raise IndexError(
                f"No marker row {header_markers} found in the first column of {file}"
            )

    # Skip the rows up to and including the marker row and keep the following rows
    df = df.iloc[corrected_index + 1:]

    # Split the patch names to keep only the part after the last '\'
    patch_names = [name.split('\\')[-1] for name in df["filename"].values]

    # Create a dataframe for the current file
    per_file_frames.append(pd.DataFrame({
        "Patch names": patch_names,
        "M": df["M"].values,
        "E": df["E"].values,
        "S": df["S"].values,
        "C": df["C"].values
    }))

# Combine all files; the per-file row indices are kept, so index values repeat
df_combined = pd.concat([df_combined, *per_file_frames])

# Print the combined dataframe
print(df_combined)
print(df_combined.shape)


C1104066_JGI.XLSX
C1105034_JGI.XLSX
C1110748_JGI.xlsx
C1112141_JGI.XLSX
C1105798_JGI.xlsx
C1117893_JGI.xlsx
C1107892_JGI.xlsx
C1107752_JGI.xlsx
C1105642_JGI.XLSX
                                          Patch names          M    E    S  \
0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg          0    0    1   
1   glomerulus C1104066 [142336, 49680, 744, 640]....          0    0  GGS   
2   glomerulus C1104066 [142772, 48280, 1100, 864]...          1    0    0   
3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg          0    0  GGS   
4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg          0    0    1   
..                                                ...        ...  ...  ...   
47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg  nan_label  noE  GGS   
48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg          1    0    1   
49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg  nan_label  noE  GGS   
50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg  nan

In [6]:
# Mapping, per lesion column, from the raw spreadsheet code to the class name
mesc_def = {
    "M": {0: "noM", 1: "yesM"},
    "E": {0: "noE", 1: "yesE"},
    "S": {"GGS": "GGS", 0: "NoGS", 1: "SGS"},
    "C": {0: "noC", 1: "yesC"},
}

# Translate each lesion column in turn (M, E, S, C); values without an entry
# in the mapping are left unchanged by replace()
for lesion_column, code_to_name in mesc_def.items():
    df_combined[lesion_column] = df_combined[lesion_column].replace(code_to_name)

print(df_combined)

                                          Patch names          M    E     S  \
0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg        noM  noE   SGS   
1   glomerulus C1104066 [142336, 49680, 744, 640]....        noM  noE   GGS   
2   glomerulus C1104066 [142772, 48280, 1100, 864]...       yesM  noE  NoGS   
3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg        noM  noE   GGS   
4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg        noM  noE   SGS   
..                                                ...        ...  ...   ...   
47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg  nan_label  noE   GGS   
48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg       yesM  noE   SGS   
49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg  nan_label  noE   GGS   
50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg  nan_label  noE   GGS   
51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg  nan_label  noE   GGS   

      C  
0   noC  
1   noC  
2   noC  
3   noC  
4

In [7]:
import numpy as np
# List every distinct value currently present in the four lesion columns
labels = df_combined[['M', 'E', 'S', 'C']].to_numpy().flatten()
distinct_labels = list(set(labels))
print(distinct_labels)

# The only values accepted as class names
possible_labels = ["noM", "yesM", "noE", "yesE", "GGS", "NoGS", "SGS", "noC", "yesC", "nan_label"]

# Keep the accepted values and turn everything else into NaN
lesion_block = df_combined.loc[:, 'M':'C']
is_known_label = lesion_block.isin(possible_labels)
df_combined.loc[:, 'M':'C'] = lesion_block.where(is_known_label, np.nan)

# Rows labelled "GGS" in the S column get NaN in the M, E and C columns
is_ggs = df_combined["S"].eq("GGS")
df_combined.loc[is_ggs, ["M", "E", "C"]] = np.nan

# Print the updated dataframe
print(df_combined)

['yesE', 'noM', 'noE', 'NoGS', 10, 'yesC', 'noC', 'yesM', 'SGS', 'GGS', nan, 'nan_label']
                                          Patch names     M    E     S    C
0   glomerulus C1104066 [10884, 59188, 956, 948].jpeg   noM  noE   SGS  noC
1   glomerulus C1104066 [142336, 49680, 744, 640]....   NaN  NaN   GGS  NaN
2   glomerulus C1104066 [142772, 48280, 1100, 864]...  yesM  noE  NoGS  noC
3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg   NaN  NaN   GGS  NaN
4   glomerulus C1104066 [28172, 21868, 736, 748].jpeg   noM  noE   SGS  noC
..                                                ...   ...  ...   ...  ...
47  glomerulus C1105642 [73828, 68492, 580, 600].jpeg   NaN  NaN   GGS  NaN
48  glomerulus C1105642 [73928, 69260, 772, 788].jpeg  yesM  noE   SGS  noC
49  glomerulus C1105642 [74416, 19216, 604, 644].jpeg   NaN  NaN   GGS  NaN
50  glomerulus C1105642 [76040, 21156, 568, 544].jpeg   NaN  NaN   GGS  NaN
51  glomerulus C1105642 [76848, 70520, 624, 680].jpeg   NaN  NaN   GGS  Na

In [8]:
# Show the rows that have a missing value in at least one column
has_missing_value = df_combined.isna().any(axis=1)
nan_rows = df_combined[has_missing_value]
print(nan_rows)

                                          Patch names     M    E     S    C
1   glomerulus C1104066 [142336, 49680, 744, 640]....   NaN  NaN   GGS  NaN
3   glomerulus C1104066 [153544, 5020, 752, 628].jpeg   NaN  NaN   GGS  NaN
7    glomerulus C1104066 [8044, 62252, 752, 796].jpeg   NaN  NaN   GGS  NaN
15  glomerulus C1104066 [94652, 48228, 636, 644].jpeg   NaN  NaN   GGS  NaN
17  glomerulus C1105034 [150832, 29052, 600, 496]....   NaN  NaN   GGS  NaN
9   glomerulus C1110748 [129452, 5728, 708, 512].jpeg   NaN  NaN   GGS  NaN
19  glomerulus C1110748 [134904, 7652, 776, 692].jpeg   NaN  NaN   GGS  NaN
22  glomerulus C1110748 [136192, 55140, 788, 688]....   NaN  NaN   GGS  NaN
25  glomerulus C1110748 [145592, 41936, 740, 640]....   NaN  NaN   GGS  NaN
40  glomerulus C1110748 [154628, 24972, 804, 684]....   NaN  NaN   GGS  NaN
41  glomerulus C1110748 [155592, 25764, 648, 612]....   NaN  NaN   GGS  NaN
46  glomerulus C1110748 [156748, 71428, 812, 692]....   NaN  NaN   GGS  NaN
48  glomerul

In [9]:
# Select the rows whose C column holds "yesC" and display them
is_yes_c = df_combined["C"].eq("yesC")
yesC_rows = df_combined[is_yes_c]
yesC_rows

Unnamed: 0,Patch names,M,E,S,C
1,"glomerulus C1107752 [130360, 32956, 1020, 1008...",yesM,yesE,NoGS,yesC
6,"glomerulus C1107752 [135308, 69504, 1012, 1004...",yesM,noE,NoGS,yesC
10,"glomerulus C1107752 [137584, 31764, 836, 872]....",yesM,noE,NoGS,yesC
39,"glomerulus C1107752 [87436, 35528, 724, 844].jpeg",yesM,noE,NoGS,yesC
2,"glomerulus C1105642 [120200, 56808, 1304, 1140...",yesM,noE,SGS,yesC


## Separate the patches into train and val sets 
A separate test set should be added, but we did not have enough data, so we decided to use the validation set as the test set.

In [10]:
import random
import shutil
import sys

# Split the cropped patches of every WSI folder into a train and a val folder.
# Images are copied straight from their source folder to their final location.

# Set the path to the Crop-256 folder
crop256_folder = "/home/wfd/Desktop/Projet_M1/FineTuning/Data/Crops"

# Set the path to the Data/Classification folder
dataset_folder = "/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification"

# Set the train and val ratio
train_ratio = 0.7
val_ratio = 0.3  # informational: val receives whatever train_ratio leaves over

# Create the train and val folders
train_folder = os.path.join(dataset_folder, "train")
val_folder = os.path.join(dataset_folder, "val")
os.makedirs(train_folder, exist_ok=True)
os.makedirs(val_folder, exist_ok=True)

# If the train and val folders are not empty, ask the user to confirm if they want to overwrite the folders
if os.listdir(train_folder) or os.listdir(val_folder):
    response = input("The train and val folders are not empty. Do you want to overwrite the folders? (yes/no): ")
    if response.lower() != "yes":
        print("Exiting the script.")
        sys.exit()
    # Reaching this point means the answer was "yes": start from empty folders
    shutil.rmtree(train_folder)
    shutil.rmtree(val_folder)
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(val_folder, exist_ok=True)

# Get the list of WSI folders in the Crop-256 folder.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# the split depend only on the seed; stray files next to the folders are skipped.
wsi_folders = sorted(
    wsi for wsi in os.listdir(crop256_folder)
    if os.path.isdir(os.path.join(crop256_folder, wsi))
)

# Draw and print the seed so that a given split can be identified afterwards
seed = random.randint(-1000, 1000)
print(f"Seed is {seed}")
random.seed(seed) # Allows for reproducibility

# Map every image name to its source path. Keying by name means an image name
# present in several WSI folders is listed once (the last folder wins), so it
# cannot end up in both train and val.
image_sources = {}
for wsi in wsi_folders:
    wsi_path = os.path.join(crop256_folder, wsi)
    for image in sorted(os.listdir(wsi_path)):
        image_path = os.path.join(wsi_path, image)
        if os.path.isfile(image_path):
            image_sources[image] = image_path

# Shuffle the list of image names
imgs = list(image_sources)
random.shuffle(imgs)

# Split the image names into train and val sets
train_size = int(train_ratio * len(imgs))
train_imgs = imgs[:train_size]
val_imgs = imgs[train_size:]

# Copy the train images to the train folder
for image in train_imgs:
    shutil.copy(image_sources[image], os.path.join(train_folder, image))

# Copy the val images to the val folder
for image in val_imgs:
    shutil.copy(image_sources[image], os.path.join(val_folder, image))

print("WSI images have been split into train and val folders.")

Seed is -828


WSI images have been split into train and val folders.


## Sort the patches into their respective classes

In [11]:
# Copy every annotated patch into one sub-folder per label, inside the split
# (train or val) in which the patch already sits.

# Set the path to the train and val folders
train_folder = "/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/train"
val_folder = "/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/val"

# Create new subdirectories for the labels in the train and val folders
for label in possible_labels:
    os.makedirs(os.path.join(train_folder, label), exist_ok=True)
    os.makedirs(os.path.join(val_folder, label), exist_ok=True)

# List the top level of each split once: the copies below only go into the
# label sub-folders, so these listings stay valid for the whole loop.
# train is checked before val.
split_contents = [
    (train_folder, set(os.listdir(train_folder))),
    (val_folder, set(os.listdir(val_folder))),
]

# Patches that are annotated but present in neither split
missing_patches = []

# Iterate over the rows in the df_combined dataframe
for _, row in df_combined.iterrows():
    # Get the labels of the current row
    labels = row[["M", "E", "S", "C"]]

    # Get the name of the current patch
    patch_name = row["Patch names"]

    # Find the split that holds the patch
    split_folder = next(
        (folder for folder, names in split_contents if patch_name in names),
        None,
    )
    if split_folder is None:
        # Without this guard the source path of the previous row would be
        # reused, copying another image under this row's labels.
        missing_patches.append(patch_name)
        continue
    source_path = os.path.join(split_folder, patch_name)

    # Copy the image into the folder of each valid label, unless already there
    for label in labels:
        if label in possible_labels:
            dest_path = os.path.join(split_folder, label)
            if not os.path.exists(os.path.join(dest_path, patch_name)):
                shutil.copy(source_path, dest_path)

if missing_patches:
    print(f"Skipped {len(missing_patches)} annotated patches not found in train or val.")

In [12]:
# Remove the regular files sitting directly inside the train and val folders;
# sub-directories (and their content) are left alone.
for split_folder in (train_folder, val_folder):
    for entry in os.listdir(split_folder):
        entry_path = os.path.join(split_folder, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)

In [13]:
# Reorganise <dataset>/<step>/<label> into <dataset>/<lesion>/<step>/<label>.

# Labels that belong to each type of lesion
lesion_labels_dict = {
    "M": ["nan_label", "noM", "yesM"],
    "E": ["noE", "yesE"],
    "S": ["GGS", "NoGS", "SGS"],
    "C": ["noC", "yesC"]
}
lesion_folders = ["M", "E", "S", "C"]

# Every (lesion, step, label) combination, in lesion -> step -> label order
targets = [
    (lesion, step, label)
    for lesion in lesion_folders
    for step in ["train", "val"]
    for label in lesion_labels_dict[lesion]
]

# Create the folder of each combination (parent folders are created as needed)
for lesion, step, label in targets:
    os.makedirs(os.path.join(dataset_folder, lesion, step, label), exist_ok=True)

# Offer to empty each destination folder that already contains something
for lesion, step, label in targets:
    label_dir = os.path.join(dataset_folder, lesion, step, label)
    if len(os.listdir(label_dir)) > 0:
        response = input(f"The {lesion}/{step}/{label} folder is not empty. Do you want to empty the folder? (yes/no): ")
        if response.lower() == "yes":
            shutil.rmtree(label_dir)
            os.makedirs(label_dir, exist_ok=True)

# Move the images to the appropriate folders, then drop the emptied source folder
for lesion, step, label in targets:
    source_folder = os.path.join(dataset_folder, step, label)
    destination_folder = os.path.join(dataset_folder, lesion, step, label)
    for image in os.listdir(source_folder):
        shutil.move(
            os.path.join(source_folder, image),
            os.path.join(destination_folder, image),
        )
    os.rmdir(source_folder)

# The former split folders must be empty by now; rmdir fails otherwise
os.rmdir(train_folder)
os.rmdir(val_folder)

In [14]:
# Report, for each lesion, how many entries its train and val label folders hold
for lesion in lesion_folders:
    num_images = sum(
        len(os.listdir(os.path.join(dataset_folder, lesion, step, label)))
        for step in ["train", "val"]
        for label in lesion_labels_dict[lesion]
    )
    print(f"{lesion}: {num_images} images")

M: 416 images
E: 414 images
S: 465 images
C: 415 images
