WladimirLct commited on
Commit
1605ebb
·
verified ·
1 Parent(s): a4a7e2d

Upload 2 files

Browse files
Files changed (2) hide show
  1. dataset_generation.ipynb +750 -0
  2. model_training.ipynb +0 -0
dataset_generation.ipynb ADDED
@@ -0,0 +1,750 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "## Delete the lines with a brown background color in the excel files\n",
8
+ "The excel files are located in the Data/Classification/labeled_data folder of the MESCnn repository."
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 4,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "from openpyxl import Workbook, load_workbook\n",
18
+ "import os \n",
19
+ "\n",
20
+ "path_to_excel = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n",
21
+ "\n",
22
+ "# Function to get the RGB value of a color\n",
23
+ "def get_rgb(color):\n",
24
+ "        return tuple(int(color[-6:][i:i+2], 16) for i in (0, 2, 4))  # use last 6 hex digits: openpyxl colors are 8-char ARGB\n",
25
+ "\n",
26
+ "for file in os.listdir(path_to_excel):\n",
27
+ " if file.endswith(\".xlsx\") or file.endswith(\".XLSX\"):\n",
28
+ " file = os.path.join(path_to_excel, file)\n",
29
+ " # Load the workbook\n",
30
+ " workbook = load_workbook(file)\n",
31
+ " \n",
32
+ " # Select the first sheet\n",
33
+ " sheet = workbook.active\n",
34
+ " \n",
35
+ " # Create a new workbook\n",
36
+ " new_workbook = Workbook()\n",
37
+ " new_sheet = new_workbook.active\n",
38
+ " \n",
39
+ " # List to store rows with RGB colors\n",
40
+ " rows_with_rgb = []\n",
41
+ " \n",
42
+ " # Iterate through each row\n",
43
+ " for row_idx, row in enumerate(sheet.iter_rows(), start=1):\n",
44
+ " row_colors = []\n",
45
+ " has_rgb_color = False # Flag to check if row has any RGB color\n",
46
+ " # Iterate through each cell in the row\n",
47
+ " for cell in row:\n",
48
+ " fill = cell.fill\n",
49
+ " if fill.start_color.type == 'rgb':\n",
50
+ " rgb_value = get_rgb(fill.start_color.rgb)\n",
51
+ " row_colors.append(rgb_value)\n",
52
+ " has_rgb_color = True\n",
53
+ " # Check if the row has at least one RGB color\n",
54
+ " if has_rgb_color:\n",
55
+ " rows_with_rgb.append(row)\n",
56
+ " \n",
57
+ " # Write rows with RGB colors to the new workbook\n",
58
+ " for row in rows_with_rgb:\n",
59
+ " new_sheet.append([cell.value for cell in row])\n",
60
+ " \n",
61
+ " # Save the new workbook\n",
62
+ " new_workbook.save(file)"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "markdown",
67
+ "metadata": {},
68
+ "source": [
69
+ "## Extract labeled data from excel files"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 5,
75
+ "metadata": {},
76
+ "outputs": [
77
+ {
78
+ "name": "stdout",
79
+ "output_type": "stream",
80
+ "text": [
81
+ "C1104066_JGI.XLSX\n",
82
+ "C1105034_JGI.XLSX\n",
83
+ "C1110748_JGI.xlsx\n",
84
+ "C1112141_JGI.XLSX\n",
85
+ "C1105798_JGI.xlsx\n",
86
+ "C1117893_JGI.xlsx\n",
87
+ "C1107892_JGI.xlsx\n",
88
+ "C1107752_JGI.xlsx\n",
89
+ "C1105642_JGI.XLSX\n",
90
+ " Patch names M E S \\\n",
91
+ "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg 0 0 1 \n",
92
+ "1 glomerulus C1104066 [142336, 49680, 744, 640].... 0 0 GGS \n",
93
+ "2 glomerulus C1104066 [142772, 48280, 1100, 864]... 1 0 0 \n",
94
+ "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg 0 0 GGS \n",
95
+ "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg 0 0 1 \n",
96
+ ".. ... ... ... ... \n",
97
+ "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg nan_label noE GGS \n",
98
+ "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg 1 0 1 \n",
99
+ "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg nan_label noE GGS \n",
100
+ "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg nan_label noE GGS \n",
101
+ "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg nan_label noE GGS \n",
102
+ "\n",
103
+ " C \n",
104
+ "0 0 \n",
105
+ "1 0 \n",
106
+ "2 0 \n",
107
+ "3 0 \n",
108
+ "4 0 \n",
109
+ ".. ... \n",
110
+ "47 noC \n",
111
+ "48 0 \n",
112
+ "49 noC \n",
113
+ "50 noC \n",
114
+ "51 noC \n",
115
+ "\n",
116
+ "[470 rows x 5 columns]\n",
117
+ "(470, 5)\n"
118
+ ]
119
+ }
120
+ ],
121
+ "source": [
122
+ "import pandas as pd\n",
123
+ " \n",
124
+ "# Set the path to the labeled data directory\n",
125
+ "labeled_data_dir = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Excels\"\n",
126
+ "\n",
127
+ "# Get the list of excel files in the labeled data directory\n",
128
+ "excel_files = [file for file in os.listdir(labeled_data_dir) if file.endswith(\".xlsx\") or file.endswith(\".XLSX\")]\n",
129
+ "\n",
130
+ "# Create an empty dataframe\n",
131
+ "df_combined = pd.DataFrame(columns=[\"Patch names\", \"M\", \"E\", \"S\", \"C\"])\n",
132
+ "\n",
133
+ "# Iterate over the excel files\n",
134
+ "for file in excel_files:\n",
135
+ " print(file)\n",
136
+ " # Read the excel file\n",
137
+ " df = pd.read_excel(os.path.join(labeled_data_dir, file))\n",
138
+ " \n",
139
+ " if file == \"C1107752_JGI.xlsx\": # This file raises an error for a reason I don't understand\n",
140
+ " corrected_index = 61 \n",
141
+ " else:\n",
142
+ " # Find the index of the row with \"CORRECTED\" or \"Corrected\" value in the first column\n",
143
+ " if (df.iloc[:, 0] == \"CORRECTED\").any():\n",
144
+ " corrected_index = df[df.iloc[:, 0] == \"CORRECTED\"].index[0]\n",
145
+ " elif (df.iloc[:, 0] == \"Corrected\").any():\n",
146
+ " corrected_index = df[df.iloc[:, 0] == \"Corrected\"].index[0]\n",
147
+ " elif (df.iloc[:, 0] == \"CORRECTED JGI\").any():\n",
148
+ " corrected_index = df[df.iloc[:, 0] == \"CORRECTED JGI\"].index[0]\n",
149
+ " else:\n",
150
+ " corrected_index = df[df.iloc[:, 0] == \"filename\"].index[0] \n",
151
+ " \n",
152
+ " # Skip the rows before the \"CORRECTED\" row and select the following rows\n",
153
+ " df = df.iloc[corrected_index + 1:]\n",
154
+ " \n",
155
+ " # Get the values in the M, E, S, and C columns\n",
156
+ " m_values = df[\"M\"].values\n",
157
+ " e_values = df[\"E\"].values\n",
158
+ " s_values = df[\"S\"].values\n",
159
+ " c_values = df[\"C\"].values\n",
160
+ " \n",
161
+ " # Get the name of each patch in the Patch_name column\n",
162
+ " patch_names = df[\"filename\"].values\n",
163
+ " \n",
164
+ " # Split the patch names to keep only the part after the last '\\'\n",
165
+ " patch_names = [name.split('\\\\')[-1] for name in patch_names]\n",
166
+ " \n",
167
+ " # Create a dataframe for the current file\n",
168
+ " df_current = pd.DataFrame({\n",
169
+ " \"Patch names\": patch_names,\n",
170
+ " \"M\": m_values,\n",
171
+ " \"E\": e_values,\n",
172
+ " \"S\": s_values,\n",
173
+ " \"C\": c_values\n",
174
+ " })\n",
175
+ " \n",
176
+ " # Append the current dataframe to the combined dataframe\n",
177
+ " df_combined = pd.concat([df_combined, df_current])\n",
178
+ "\n",
179
+ "# Print the combined dataframe\n",
180
+ "print(df_combined)\n",
181
+ "print(df_combined.shape)\n"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "metadata": {},
188
+ "outputs": [
189
+ {
190
+ "name": "stdout",
191
+ "output_type": "stream",
192
+ "text": [
193
+ " Patch names M E S \\\n",
194
+ "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg noM noE SGS \n",
195
+ "1 glomerulus C1104066 [142336, 49680, 744, 640].... noM noE GGS \n",
196
+ "2 glomerulus C1104066 [142772, 48280, 1100, 864]... yesM noE NoGS \n",
197
+ "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg noM noE GGS \n",
198
+ "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg noM noE SGS \n",
199
+ ".. ... ... ... ... \n",
200
+ "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg nan_label noE GGS \n",
201
+ "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg yesM noE SGS \n",
202
+ "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg nan_label noE GGS \n",
203
+ "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg nan_label noE GGS \n",
204
+ "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg nan_label noE GGS \n",
205
+ "\n",
206
+ " C \n",
207
+ "0 noC \n",
208
+ "1 noC \n",
209
+ "2 noC \n",
210
+ "3 noC \n",
211
+ "4 noC \n",
212
+ ".. ... \n",
213
+ "47 noC \n",
214
+ "48 noC \n",
215
+ "49 noC \n",
216
+ "50 noC \n",
217
+ "51 noC \n",
218
+ "\n",
219
+ "[470 rows x 5 columns]\n"
220
+ ]
221
+ }
222
+ ],
223
+ "source": [
224
+ "mesc_def = {\n",
225
+ " \"M\": {\n",
226
+ " 0: \"noM\",\n",
227
+ " 1: \"yesM\",\n",
228
+ " },\n",
229
+ " \"E\": {\n",
230
+ " 0: \"noE\",\n",
231
+ " 1: \"yesE\"\n",
232
+ " },\n",
233
+ " \"S\": {\n",
234
+ " \"GGS\": \"GGS\",\n",
235
+ " 0: \"NoGS\",\n",
236
+ " 1: \"SGS\"\n",
237
+ " },\n",
238
+ " \"C\": {\n",
239
+ " 0: \"noC\",\n",
240
+ " 1: \"yesC\"\n",
241
+ " }\n",
242
+ "}\n",
243
+ "df_combined[\"M\"] = df_combined[\"M\"].replace(mesc_def[\"M\"])\n",
244
+ "df_combined[\"E\"] = df_combined[\"E\"].replace(mesc_def[\"E\"])\n",
245
+ "df_combined[\"S\"] = df_combined[\"S\"].replace(mesc_def[\"S\"])\n",
246
+ "df_combined[\"C\"] = df_combined[\"C\"].replace(mesc_def[\"C\"])\n",
247
+ "print(df_combined)"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 7,
253
+ "metadata": {},
254
+ "outputs": [
255
+ {
256
+ "name": "stdout",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "['yesE', 'noM', 'noE', 'NoGS', 10, 'yesC', 'noC', 'yesM', 'SGS', 'GGS', nan, 'nan_label']\n",
260
+ " Patch names M E S C\n",
261
+ "0 glomerulus C1104066 [10884, 59188, 956, 948].jpeg noM noE SGS noC\n",
262
+ "1 glomerulus C1104066 [142336, 49680, 744, 640].... NaN NaN GGS NaN\n",
263
+ "2 glomerulus C1104066 [142772, 48280, 1100, 864]... yesM noE NoGS noC\n",
264
+ "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg NaN NaN GGS NaN\n",
265
+ "4 glomerulus C1104066 [28172, 21868, 736, 748].jpeg noM noE SGS noC\n",
266
+ ".. ... ... ... ... ...\n",
267
+ "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg NaN NaN GGS NaN\n",
268
+ "48 glomerulus C1105642 [73928, 69260, 772, 788].jpeg yesM noE SGS noC\n",
269
+ "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg NaN NaN GGS NaN\n",
270
+ "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg NaN NaN GGS NaN\n",
271
+ "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg NaN NaN GGS NaN\n",
272
+ "\n",
273
+ "[470 rows x 5 columns]\n"
274
+ ]
275
+ }
276
+ ],
277
+ "source": [
278
+ "import numpy as np\n",
279
+ "labels = df_combined[['M', 'E', 'S', 'C']].values.flatten()\n",
280
+ "distinct_labels = list(set(labels))\n",
281
+ "print(distinct_labels)\n",
282
+ "\n",
283
+ "possible_labels = [\"noM\", \"yesM\", \"noE\", \"yesE\", \"GGS\", \"NoGS\", \"SGS\", \"noC\", \"yesC\", \"nan_label\"]\n",
284
+ "\n",
285
+ "# Replace values that are not in the possible_labels list with NaN\n",
286
+ "df_combined.loc[:, 'M':'C'] = df_combined.loc[:, 'M':'C'].apply(lambda x: np.where(x.isin(possible_labels), x, np.nan))\n",
287
+ "\n",
288
+ "# If the value in the S column is \"GGS\", set the value in the other columns to NaN\n",
289
+ "df_combined.loc[df_combined[\"S\"] == \"GGS\", [\"M\", \"E\", \"C\"]] = np.nan\n",
290
+ "\n",
291
+ "# Print the updated dataframe\n",
292
+ "print(df_combined)"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": 8,
298
+ "metadata": {},
299
+ "outputs": [
300
+ {
301
+ "name": "stdout",
302
+ "output_type": "stream",
303
+ "text": [
304
+ " Patch names M E S C\n",
305
+ "1 glomerulus C1104066 [142336, 49680, 744, 640].... NaN NaN GGS NaN\n",
306
+ "3 glomerulus C1104066 [153544, 5020, 752, 628].jpeg NaN NaN GGS NaN\n",
307
+ "7 glomerulus C1104066 [8044, 62252, 752, 796].jpeg NaN NaN GGS NaN\n",
308
+ "15 glomerulus C1104066 [94652, 48228, 636, 644].jpeg NaN NaN GGS NaN\n",
309
+ "17 glomerulus C1105034 [150832, 29052, 600, 496].... NaN NaN GGS NaN\n",
310
+ "9 glomerulus C1110748 [129452, 5728, 708, 512].jpeg NaN NaN GGS NaN\n",
311
+ "19 glomerulus C1110748 [134904, 7652, 776, 692].jpeg NaN NaN GGS NaN\n",
312
+ "22 glomerulus C1110748 [136192, 55140, 788, 688].... NaN NaN GGS NaN\n",
313
+ "25 glomerulus C1110748 [145592, 41936, 740, 640].... NaN NaN GGS NaN\n",
314
+ "40 glomerulus C1110748 [154628, 24972, 804, 684].... NaN NaN GGS NaN\n",
315
+ "41 glomerulus C1110748 [155592, 25764, 648, 612].... NaN NaN GGS NaN\n",
316
+ "46 glomerulus C1110748 [156748, 71428, 812, 692].... NaN NaN GGS NaN\n",
317
+ "48 glomerulus C1110748 [157812, 72180, 600, 536].... NaN NaN GGS NaN\n",
318
+ "36 glomerulus C1112141 [78580, 16560, 656, 788].jpeg NaN NaN GGS NaN\n",
319
+ "43 glomerulus C1112141 [82724, 17252, 860, 808].jpeg NaN NaN GGS NaN\n",
320
+ "46 glomerulus C1112141 [83852, 19840, 884, 944].jpeg yesM NaN NoGS noC\n",
321
+ "48 glomerulus C1112141 [86140, 60432, 720, 776].jpeg NaN NaN GGS NaN\n",
322
+ "50 glomerulus C1112141 [87964, 20760, 672, 732].jpeg NaN NaN GGS NaN\n",
323
+ "55 glomerulus C1112141 [90196, 61504, 848, 804].jpeg NaN NaN GGS NaN\n",
324
+ "58 glomerulus C1112141 [95092, 65612, 680, 668].jpeg NaN NaN GGS NaN\n",
325
+ "4 glomerulus C1105798 [118952, 9668, 980, 896].jpeg NaN NaN GGS NaN\n",
326
+ "6 glomerulus C1105798 [120488, 15428, 684, 516].... NaN NaN GGS NaN\n",
327
+ "14 glomerulus C1105798 [129104, 54064, 708, 576].... NaN NaN GGS NaN\n",
328
+ "54 glomerulus C1105798 [76196, 61668, 740, 968].jpeg NaN NaN GGS NaN\n",
329
+ "28 glomerulus C1117893 [26068, 32092, 724, 708].jpeg NaN NaN GGS NaN\n",
330
+ "32 glomerulus C1117893 [31252, 77564, 700, 696].jpeg NaN NaN GGS NaN\n",
331
+ "33 glomerulus C1117893 [65224, 17120, 528, 544].jpeg NaN NaN GGS NaN\n",
332
+ "11 glomerulus C1107892 [126480, 27244, 588, 564].... NaN NaN GGS NaN\n",
333
+ "43 glomerulus C1107892 [75916, 26668, 564, 572].jpeg NaN NaN GGS NaN\n",
334
+ "44 glomerulus C1107892 [76200, 75040, 508, 576].jpeg NaN NaN GGS NaN\n",
335
+ "48 glomerulus C1107892 [77772, 25272, 740, 760].jpeg NaN NaN GGS NaN\n",
336
+ "49 glomerulus C1107892 [77980, 73584, 732, 724].jpeg NaN NaN GGS NaN\n",
337
+ "55 glomerulus C1107892 [80568, 69696, 616, 644].jpeg NaN NaN GGS NaN\n",
338
+ "56 glomerulus C1107892 [80608, 21544, 624, 660].jpeg NaN NaN GGS NaN\n",
339
+ "11 glomerulus C1105642 [136108, 72452, 612, 532].... NaN NaN GGS NaN\n",
340
+ "12 glomerulus C1105642 [136892, 73056, 596, 540].... NaN NaN GGS NaN\n",
341
+ "13 glomerulus C1105642 [137860, 71816, 640, 728].... NaN NaN GGS NaN\n",
342
+ "18 glomerulus C1105642 [140788, 20956, 616, 548].... NaN NaN GGS NaN\n",
343
+ "19 glomerulus C1105642 [141656, 21460, 620, 576].... NaN NaN GGS NaN\n",
344
+ "20 glomerulus C1105642 [142460, 20320, 540, 512].... NaN NaN GGS NaN\n",
345
+ "22 glomerulus C1105642 [14640, 21940, 524, 584].jpeg NaN NaN GGS NaN\n",
346
+ "29 glomerulus C1105642 [64876, 12060, 596, 648].jpeg NaN NaN GGS NaN\n",
347
+ "33 glomerulus C1105642 [67600, 62876, 656, 680].jpeg NaN NaN GGS NaN\n",
348
+ "35 glomerulus C1105642 [68388, 15580, 644, 604].jpeg NaN NaN GGS NaN\n",
349
+ "40 glomerulus C1105642 [70972, 66596, 652, 628].jpeg NaN NaN GGS NaN\n",
350
+ "41 glomerulus C1105642 [71324, 17312, 560, 556].jpeg NaN NaN GGS NaN\n",
351
+ "46 glomerulus C1105642 [72752, 20572, 620, 524].jpeg NaN NaN GGS NaN\n",
352
+ "47 glomerulus C1105642 [73828, 68492, 580, 600].jpeg NaN NaN GGS NaN\n",
353
+ "49 glomerulus C1105642 [74416, 19216, 604, 644].jpeg NaN NaN GGS NaN\n",
354
+ "50 glomerulus C1105642 [76040, 21156, 568, 544].jpeg NaN NaN GGS NaN\n",
355
+ "51 glomerulus C1105642 [76848, 70520, 624, 680].jpeg NaN NaN GGS NaN\n"
356
+ ]
357
+ }
358
+ ],
359
+ "source": [
360
+ "nan_rows = df_combined[df_combined.isnull().any(axis=1)]\n",
361
+ "print(nan_rows)"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": 9,
367
+ "metadata": {},
368
+ "outputs": [
369
+ {
370
+ "data": {
371
+ "text/html": [
372
+ "<div>\n",
373
+ "<style scoped>\n",
374
+ " .dataframe tbody tr th:only-of-type {\n",
375
+ " vertical-align: middle;\n",
376
+ " }\n",
377
+ "\n",
378
+ " .dataframe tbody tr th {\n",
379
+ " vertical-align: top;\n",
380
+ " }\n",
381
+ "\n",
382
+ " .dataframe thead th {\n",
383
+ " text-align: right;\n",
384
+ " }\n",
385
+ "</style>\n",
386
+ "<table border=\"1\" class=\"dataframe\">\n",
387
+ " <thead>\n",
388
+ " <tr style=\"text-align: right;\">\n",
389
+ " <th></th>\n",
390
+ " <th>Patch names</th>\n",
391
+ " <th>M</th>\n",
392
+ " <th>E</th>\n",
393
+ " <th>S</th>\n",
394
+ " <th>C</th>\n",
395
+ " </tr>\n",
396
+ " </thead>\n",
397
+ " <tbody>\n",
398
+ " <tr>\n",
399
+ " <th>1</th>\n",
400
+ " <td>glomerulus C1107752 [130360, 32956, 1020, 1008...</td>\n",
401
+ " <td>yesM</td>\n",
402
+ " <td>yesE</td>\n",
403
+ " <td>NoGS</td>\n",
404
+ " <td>yesC</td>\n",
405
+ " </tr>\n",
406
+ " <tr>\n",
407
+ " <th>6</th>\n",
408
+ " <td>glomerulus C1107752 [135308, 69504, 1012, 1004...</td>\n",
409
+ " <td>yesM</td>\n",
410
+ " <td>noE</td>\n",
411
+ " <td>NoGS</td>\n",
412
+ " <td>yesC</td>\n",
413
+ " </tr>\n",
414
+ " <tr>\n",
415
+ " <th>10</th>\n",
416
+ " <td>glomerulus C1107752 [137584, 31764, 836, 872]....</td>\n",
417
+ " <td>yesM</td>\n",
418
+ " <td>noE</td>\n",
419
+ " <td>NoGS</td>\n",
420
+ " <td>yesC</td>\n",
421
+ " </tr>\n",
422
+ " <tr>\n",
423
+ " <th>39</th>\n",
424
+ " <td>glomerulus C1107752 [87436, 35528, 724, 844].jpeg</td>\n",
425
+ " <td>yesM</td>\n",
426
+ " <td>noE</td>\n",
427
+ " <td>NoGS</td>\n",
428
+ " <td>yesC</td>\n",
429
+ " </tr>\n",
430
+ " <tr>\n",
431
+ " <th>2</th>\n",
432
+ " <td>glomerulus C1105642 [120200, 56808, 1304, 1140...</td>\n",
433
+ " <td>yesM</td>\n",
434
+ " <td>noE</td>\n",
435
+ " <td>SGS</td>\n",
436
+ " <td>yesC</td>\n",
437
+ " </tr>\n",
438
+ " </tbody>\n",
439
+ "</table>\n",
440
+ "</div>"
441
+ ],
442
+ "text/plain": [
443
+ " Patch names M E S C\n",
444
+ "1 glomerulus C1107752 [130360, 32956, 1020, 1008... yesM yesE NoGS yesC\n",
445
+ "6 glomerulus C1107752 [135308, 69504, 1012, 1004... yesM noE NoGS yesC\n",
446
+ "10 glomerulus C1107752 [137584, 31764, 836, 872].... yesM noE NoGS yesC\n",
447
+ "39 glomerulus C1107752 [87436, 35528, 724, 844].jpeg yesM noE NoGS yesC\n",
448
+ "2 glomerulus C1105642 [120200, 56808, 1304, 1140... yesM noE SGS yesC"
449
+ ]
450
+ },
451
+ "execution_count": 9,
452
+ "metadata": {},
453
+ "output_type": "execute_result"
454
+ }
455
+ ],
456
+ "source": [
457
+ "# print the rows with yesC in the C column\n",
458
+ "yesC_rows = df_combined[df_combined[\"C\"] == \"yesC\"]\n",
459
+ "yesC_rows"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "markdown",
464
+ "metadata": {},
465
+ "source": [
466
+ "## Separate the patches into train and val sets \n",
467
+ "Test set needs to be added but we didn't have enough data so we decided to use the validation set as the test set."
468
+ ]
469
+ },
470
+ {
471
+ "cell_type": "code",
472
+ "execution_count": 10,
473
+ "metadata": {},
474
+ "outputs": [
475
+ {
476
+ "name": "stdout",
477
+ "output_type": "stream",
478
+ "text": [
479
+ "Seed is -828\n"
480
+ ]
481
+ },
482
+ {
483
+ "name": "stdout",
484
+ "output_type": "stream",
485
+ "text": [
486
+ "WSI images have been split into train and val folders.\n"
487
+ ]
488
+ }
489
+ ],
490
+ "source": [
491
+ "import random\n",
492
+ "import shutil\n",
493
+ "import sys\n",
494
+ "\n",
495
+ "# Set the path to the Crop-256 folder\n",
496
+ "crop256_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Crops\"\n",
497
+ "\n",
498
+ "# Set the path to the Data/Classification folder\n",
499
+ "dataset_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification\"\n",
500
+ "\n",
501
+ "# Set the train and val ratio\n",
502
+ "train_ratio = 0.7\n",
503
+ "val_ratio = 0.3\n",
504
+ "\n",
505
+ "# Create the train and val folders\n",
506
+ "train_folder = os.path.join(dataset_folder, \"train\")\n",
507
+ "val_folder = os.path.join(dataset_folder, \"val\")\n",
508
+ "os.makedirs(train_folder, exist_ok=True)\n",
509
+ "os.makedirs(val_folder, exist_ok=True)\n",
510
+ "\n",
511
+ "# If the train and val folders are not empty, ask the user to confirm if they want to overwrite the folders\n",
512
+ "if len(os.listdir(train_folder)) > 0 or len(os.listdir(val_folder)) > 0:\n",
513
+ " response = input(\"The train and val folders are not empty. Do you want to overwrite the folders? (yes/no): \")\n",
514
+ " if response.lower() != \"yes\":\n",
515
+ " print(\"Exiting the script.\")\n",
516
+ " sys.exit()\n",
517
+ " if response.lower() == \"yes\":\n",
518
+ " # Remove the existing folders\n",
519
+ " shutil.rmtree(train_folder)\n",
520
+ " shutil.rmtree(val_folder)\n",
521
+ " # Create the folders again\n",
522
+ " os.makedirs(train_folder, exist_ok=True)\n",
523
+ " os.makedirs(val_folder, exist_ok=True)\n",
524
+ " \n",
525
+ "# Get the list of WSI folders in the Crop-256 folder\n",
526
+ "wsi_folders = [wsi for wsi in os.listdir(crop256_folder)]\n",
527
+ "\n",
528
+ "# Shuffle the list of WSI images\n",
529
+ "seed = random.randint(-1000, 1000)\n",
530
+ "print(f\"Seed is {seed}\")\n",
531
+ "random.seed(seed) # Seed is drawn at random each run, but it is printed above so a given split can be reproduced by re-using it\n",
532
+ "\n",
533
+ "imgs = []\n",
534
+ "os.makedirs(os.path.join(train_folder), exist_ok=True)\n",
535
+ "for wsi in wsi_folders:\n",
536
+ " # Copy the images to the train folder\n",
537
+ " for image in os.listdir(os.path.join(crop256_folder, wsi)):\n",
538
+ " src_path = os.path.join(crop256_folder, wsi, image)\n",
539
+ " dst_path = os.path.join(dataset_folder, image)\n",
540
+ " imgs.append(image)\n",
541
+ " shutil.copy(src_path, dst_path)\n",
542
+ "\n",
543
+ "# Shuffle the list of image paths\n",
544
+ "random.seed(seed) # Allows for reproducibility\n",
545
+ "random.shuffle(imgs)\n",
546
+ "\n",
547
+ "# Split the image paths into train and val sets\n",
548
+ "train_size = int(train_ratio * len(imgs))\n",
549
+ "train_imgs = imgs[:train_size]\n",
550
+ "val_imgs = imgs[train_size:]\n",
551
+ "\n",
552
+ "# Copy the train images to the train folder\n",
553
+ "os.makedirs(os.path.join(train_folder), exist_ok=True)\n",
554
+ "# Copy the images to the train folder\n",
555
+ "for image in train_imgs:\n",
556
+ " src_path = os.path.join(dataset_folder, image)\n",
557
+ " dst_path = os.path.join(train_folder, image)\n",
558
+ " shutil.copy(src_path, dst_path)\n",
559
+ " \n",
560
+ "# Create the folder in the val folder\n",
561
+ "os.makedirs(os.path.join(val_folder), exist_ok=True)\n",
562
+ "# Copy the images to the val folder\n",
563
+ "for image in val_imgs:\n",
564
+ " src_path = os.path.join(dataset_folder, image)\n",
565
+ " dst_path = os.path.join(val_folder, image)\n",
566
+ " shutil.copy(src_path, dst_path)\n",
567
+ "\n",
568
+ "# Remove the images from the dataset folder\n",
569
+ "for image in imgs:\n",
570
+ " os.remove(os.path.join(dataset_folder, image))\n",
571
+ "\n",
572
+ "print(\"WSI images have been split into train and val folders.\")"
573
+ ]
574
+ },
575
+ {
576
+ "cell_type": "markdown",
577
+ "metadata": {},
578
+ "source": [
579
+ "## Sort the patches into their respective classes"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": 11,
585
+ "metadata": {},
586
+ "outputs": [],
587
+ "source": [
588
+ "# Set the path to the train and val folders\n",
589
+ "train_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/train\"\n",
590
+ "val_folder = \"/home/wfd/Desktop/Projet_M1/FineTuning/Data/Classification/val\"\n",
591
+ "\n",
592
+ "# Create new subdirectories for the labels in the train and val folders \n",
593
+ "for label in possible_labels:\n",
594
+ " os.makedirs(os.path.join(train_folder, label), exist_ok=True)\n",
595
+ " os.makedirs(os.path.join(val_folder, label), exist_ok=True)\n",
596
+ " \n",
597
+ "# Iterate over the rows in the df_combined dataframe\n",
598
+ "for index, row in df_combined.iterrows():\n",
599
+ " # Get the labels of the current row\n",
600
+ " labels = row[[\"M\", \"E\", \"S\", \"C\"]]\n",
601
+ " \n",
602
+ " # Get the name of the current patch\n",
603
+ " patch_name = row[\"Patch names\"]\n",
604
+ " \n",
605
+ " # Set the source path of the image\n",
606
+ " if patch_name in os.listdir(train_folder):\n",
607
+ " source_path = os.path.join(train_folder, patch_name)\n",
608
+ " elif patch_name in os.listdir(val_folder):\n",
609
+ "        source_path = os.path.join(val_folder, patch_name)\n",
+ "    else:\n",
+ "        continue  # patch missing from both splits: skip instead of reusing a stale source_path\n",
610
+ " \n",
611
+ " # Set the destination paths of the image\n",
612
+ " for label in labels:\n",
613
+ " if label in possible_labels:\n",
614
+ " if source_path.split(\"/\")[-2] == \"train\":\n",
615
+ " dest_path = os.path.join(train_folder, label)\n",
616
+ " else:\n",
617
+ " dest_path = os.path.join(val_folder, label)\n",
618
+ " if patch_name in os.listdir(dest_path):\n",
619
+ " pass\n",
620
+ " else:\n",
621
+ " shutil.copy(source_path, dest_path)"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": 12,
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": [
630
+ "# Delete all the images in the train and val folders that are not in subdirectories\n",
631
+ "for image in os.listdir(train_folder):\n",
632
+ " if os.path.isfile(os.path.join(train_folder, image)):\n",
633
+ " os.remove(os.path.join(train_folder, image))\n",
634
+ " \n",
635
+ "for image in os.listdir(val_folder):\n",
636
+ " if os.path.isfile(os.path.join(val_folder, image)):\n",
637
+ " os.remove(os.path.join(val_folder, image))"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "code",
642
+ "execution_count": 13,
643
+ "metadata": {},
644
+ "outputs": [],
645
+ "source": [
646
+ "# Create folders for each type of lesion\n",
647
+ "lesion_folders = [\"M\", \"E\", \"S\", \"C\"]\n",
648
+ "for lesion in lesion_folders:\n",
649
+ " lesion_path = os.path.join(dataset_folder, lesion)\n",
650
+ " os.makedirs(lesion_path, exist_ok=True)\n",
651
+ " for step in [\"train\", \"val\"]:\n",
652
+ " os.makedirs(os.path.join(lesion_path, step), exist_ok=True)\n",
653
+ " if lesion == \"M\":\n",
654
+ " os.makedirs(os.path.join(lesion_path, step, \"nan_label\"), exist_ok=True)\n",
655
+ " os.makedirs(os.path.join(lesion_path, step, \"noM\"), exist_ok=True)\n",
656
+ " os.makedirs(os.path.join(lesion_path, step, \"yesM\"), exist_ok=True)\n",
657
+ " if lesion == \"E\":\n",
658
+ " os.makedirs(os.path.join(lesion_path, step, \"noE\"), exist_ok=True)\n",
659
+ " os.makedirs(os.path.join(lesion_path, step, \"yesE\"), exist_ok=True)\n",
660
+ " if lesion == \"S\":\n",
661
+ " os.makedirs(os.path.join(lesion_path, step, \"GGS\"), exist_ok=True)\n",
662
+ " os.makedirs(os.path.join(lesion_path, step, \"NoGS\"), exist_ok=True)\n",
663
+ " os.makedirs(os.path.join(lesion_path, step, \"SGS\"), exist_ok=True)\n",
664
+ " if lesion == \"C\":\n",
665
+ " os.makedirs(os.path.join(lesion_path, step, \"noC\"), exist_ok=True)\n",
666
+ " os.makedirs(os.path.join(lesion_path, step, \"yesC\"), exist_ok=True)\n",
667
+ " \n",
668
+ "# Move the images to the appropriate folders\n",
669
+ "lesion_labels_dict = {\n",
670
+ " \"M\": [\"nan_label\", \"noM\", \"yesM\"],\n",
671
+ " \"E\": [\"noE\", \"yesE\"],\n",
672
+ " \"S\": [\"GGS\", \"NoGS\", \"SGS\"],\n",
673
+ " \"C\": [\"noC\", \"yesC\"]\n",
674
+ "}\n",
675
+ "\n",
676
+ "# Add the possibility to empty the folders if they are not empty\n",
677
+ "for lesion in lesion_folders:\n",
678
+ " for step in [\"train\", \"val\"]:\n",
679
+ " for label in lesion_labels_dict[lesion]:\n",
680
+ " if len(os.listdir(os.path.join(dataset_folder, lesion, step, label))) > 0:\n",
681
+ " response = input(f\"The {lesion}/{step}/{label} folder is not empty. Do you want to empty the folder? (yes/no): \")\n",
682
+ " if response.lower() == \"yes\":\n",
683
+ " shutil.rmtree(os.path.join(dataset_folder, lesion, step, label))\n",
684
+ " os.makedirs(os.path.join(dataset_folder, lesion, step, label), exist_ok=True)\n",
685
+ " \n",
686
+ "# Move the images to the appropriate folders \n",
687
+ "for lesion in lesion_labels_dict.keys():\n",
688
+ " for step in [\"train\", \"val\"]:\n",
689
+ " for label in lesion_labels_dict[lesion]:\n",
690
+ " source_folder = os.path.join(dataset_folder, step, label)\n",
691
+ " destination_folder = os.path.join(dataset_folder, lesion, step, label)\n",
692
+ " for image in os.listdir(source_folder):\n",
693
+ " source_path = os.path.join(source_folder, image)\n",
694
+ " destination_path = os.path.join(destination_folder, image)\n",
695
+ " shutil.move(source_path, destination_path)\n",
696
+ " os.rmdir(source_folder)\n",
697
+ "\n",
698
+ "os.rmdir(train_folder)\n",
699
+ "os.rmdir(val_folder)"
700
+ ]
701
+ },
702
+ {
703
+ "cell_type": "code",
704
+ "execution_count": 14,
705
+ "metadata": {},
706
+ "outputs": [
707
+ {
708
+ "name": "stdout",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "M: 416 images\n",
712
+ "E: 414 images\n",
713
+ "S: 465 images\n",
714
+ "C: 415 images\n"
715
+ ]
716
+ }
717
+ ],
718
+ "source": [
719
+ "# Give the amount of images by lesion\n",
720
+ "for lesion in lesion_folders:\n",
721
+ " num_images = 0\n",
722
+ " for step in [\"train\", \"val\"]:\n",
723
+ " for label in lesion_labels_dict[lesion]:\n",
724
+ " num_images += len(os.listdir(os.path.join(dataset_folder, lesion, step, label)))\n",
725
+ " print(f\"{lesion}: {num_images} images\")"
726
+ ]
727
+ }
728
+ ],
729
+ "metadata": {
730
+ "kernelspec": {
731
+ "display_name": "segmentation",
732
+ "language": "python",
733
+ "name": "python3"
734
+ },
735
+ "language_info": {
736
+ "codemirror_mode": {
737
+ "name": "ipython",
738
+ "version": 3
739
+ },
740
+ "file_extension": ".py",
741
+ "mimetype": "text/x-python",
742
+ "name": "python",
743
+ "nbconvert_exporter": "python",
744
+ "pygments_lexer": "ipython3",
745
+ "version": "3.10.14"
746
+ }
747
+ },
748
+ "nbformat": 4,
749
+ "nbformat_minor": 2
750
+ }
model_training.ipynb ADDED
The diff for this file is too large to render. See raw diff