GotThatData committed on
Commit
b4b0910
·
verified ·
1 Parent(s): b489b3f
Files changed (1) hide show
  1. app.py +105 -30
app.py CHANGED
@@ -10,7 +10,10 @@ import logging
10
  import yaml
11
 
12
  # Set up logging
13
- logging.basicConfig(level=logging.INFO)
 
 
 
14
  logger = logging.getLogger(__name__)
15
 
16
  # Load settings
@@ -26,8 +29,8 @@ def safe_load_dataset(dataset_name):
26
  try:
27
  dataset = load_dataset(dataset_name)
28
  return dataset, len(dataset['train']) if 'train' in dataset else 0
29
- except Exception:
30
- logger.info("No existing dataset found. Starting fresh.")
31
  return None, 0
32
 
33
  def is_valid_image(file_path):
@@ -35,10 +38,21 @@ def is_valid_image(file_path):
35
  try:
36
  with Image.open(file_path) as img:
37
  img.verify()
38
- return True
39
- except:
 
40
  return False
41
 
 
 
 
 
 
 
 
 
 
 
42
  # DatasetManager Class
43
  class DatasetManager:
44
  def __init__(self, local_images_dir="downloaded_cards"):
@@ -66,6 +80,7 @@ class DatasetManager:
66
  self.drive = GoogleDrive(gauth)
67
  return True, "Successfully authenticated with Google Drive"
68
  except Exception as e:
 
69
  return False, f"Authentication failed: {str(e)}"
70
 
71
  def download_and_rename_files(self, drive_folder_id, naming_convention):
@@ -76,35 +91,55 @@ class DatasetManager:
76
  try:
77
  query = f"'{drive_folder_id}' in parents and trashed=false"
78
  file_list = self.drive.ListFile({'q': query}).GetList()
 
79
  if not file_list:
 
80
  return False, "No files found in the specified folder.", []
81
 
82
  existing_dataset, start_index = safe_load_dataset(self.dataset_name)
83
  renamed_files = []
 
 
84
 
85
  for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
86
  if 'mimeType' in file and 'image' in file['mimeType'].lower():
87
- new_filename = f"{naming_convention}_{start_index + i + 1}.jpg"
88
  file_path = os.path.join(self.local_images_dir, new_filename)
89
- file.GetContentFile(file_path)
90
-
91
- if is_valid_image(file_path):
92
- renamed_files.append({
93
- 'file_path': file_path,
94
- 'original_name': file['title'],
95
- 'new_name': new_filename
96
- })
97
- logger.info(f"Downloaded and renamed: {file['title']} -> {new_filename}")
98
- else:
99
- logger.error(f"Invalid image detected, removing {file_path}")
100
- os.remove(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
- return True, f"Processed {len(renamed_files)} images", renamed_files
103
  except Exception as e:
 
104
  return False, f"Error during download: {str(e)}", []
105
 
106
  def update_huggingface_dataset(self, renamed_files):
107
  """Update Hugging Face dataset with new images."""
 
 
 
108
  try:
109
  df = pd.DataFrame(renamed_files)
110
  new_dataset = Dataset.from_pandas(df)
@@ -118,11 +153,17 @@ class DatasetManager:
118
  combined_dataset.push_to_hub(self.dataset_name, split="train")
119
  return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
120
  except Exception as e:
 
121
  return False, f"Error updating Hugging Face dataset: {str(e)}"
122
 
123
  # Process Pipeline
124
  def process_pipeline(folder_id, naming_convention):
125
  """Main pipeline for processing images and updating dataset."""
 
 
 
 
 
126
  manager = DatasetManager()
127
 
128
  # Step 1: Authenticate Google Drive
@@ -140,26 +181,60 @@ def process_pipeline(folder_id, naming_convention):
140
  return f"{message}\n{hf_message}", renamed_files
141
 
142
  # Gradio Interface
143
- with gr.Blocks() as demo:
144
  gr.Markdown("# Sports Cards Dataset Processor")
145
 
146
  with gr.Box():
147
- gr.Markdown("### Instructions: Upload from Google Drive and Update Hugging Face Dataset")
 
 
 
 
 
 
 
148
 
149
  with gr.Row():
150
- folder_id = gr.Textbox(label="Google Drive Folder ID", placeholder="Enter the folder ID")
151
- naming_convention = gr.Textbox(label="Naming Convention", placeholder="e.g., sports_card")
152
- process_btn = gr.Button("Process Images")
153
-
154
- output = gr.Textbox(label="Status")
155
- output_table = gr.Dataframe(label="Processed Files", headers=["Original Name", "New Name", "File Path"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  def process_ui(folder_id, naming_convention):
158
  status, renamed_files = process_pipeline(folder_id, naming_convention)
159
- table_data = [[file['original_name'], file['new_name'], file['file_path']] for file in renamed_files]
 
160
  return status, table_data
161
 
162
- process_btn.click(process_ui, inputs=[folder_id, naming_convention], outputs=[output, output_table])
 
 
 
 
163
 
164
  if __name__ == "__main__":
165
- demo.launch()
 
10
  import yaml
11
 
12
  # Set up logging
13
# Configure root logging once at import time: INFO level with a
# timestamped "time - level - message" line format.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
18
 
19
  # Load settings
 
29
  try:
30
  dataset = load_dataset(dataset_name)
31
  return dataset, len(dataset['train']) if 'train' in dataset else 0
32
+ except Exception as e:
33
+ logger.info(f"No existing dataset found. Starting fresh. Error: {str(e)}")
34
  return None, 0
35
 
36
def is_valid_image(file_path):
    """Return True if *file_path* opens and verifies as an image, else False.

    Any failure (unreadable file, corrupt data, etc.) is logged and reported
    as False rather than raised to the caller.
    """
    try:
        image = Image.open(file_path)
        with image:
            image.verify()
        return True
    except Exception as e:
        logger.error(f"Invalid image: {file_path}. Error: {str(e)}")
        return False
45
 
46
def validate_input(folder_id, naming_convention):
    """Validate user input.

    Returns a (is_valid, error_message) pair; error_message is "" on success.
    """
    # Both fields must be present and non-blank.
    if not (folder_id and folder_id.strip()):
        return False, "Folder ID cannot be empty"
    if not (naming_convention and naming_convention.strip()):
        return False, "Naming convention cannot be empty"
    # After dropping underscores, only letters/digits may remain.
    remainder = naming_convention.replace('_', '')
    if not remainder.isalnum():
        return False, "Naming convention should only contain letters, numbers, and underscores"
    return True, ""
55
+
56
  # DatasetManager Class
57
  class DatasetManager:
58
  def __init__(self, local_images_dir="downloaded_cards"):
 
80
  self.drive = GoogleDrive(gauth)
81
  return True, "Successfully authenticated with Google Drive"
82
  except Exception as e:
83
+ logger.error(f"Authentication failed: {str(e)}")
84
  return False, f"Authentication failed: {str(e)}"
85
 
86
  def download_and_rename_files(self, drive_folder_id, naming_convention):
 
91
  try:
92
  query = f"'{drive_folder_id}' in parents and trashed=false"
93
  file_list = self.drive.ListFile({'q': query}).GetList()
94
+
95
  if not file_list:
96
+ logger.warning(f"No files found in folder: {drive_folder_id}")
97
  return False, "No files found in the specified folder.", []
98
 
99
  existing_dataset, start_index = safe_load_dataset(self.dataset_name)
100
  renamed_files = []
101
+ processed_count = 0
102
+ error_count = 0
103
 
104
  for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
105
  if 'mimeType' in file and 'image' in file['mimeType'].lower():
106
+ new_filename = f"{naming_convention}_{start_index + processed_count + 1}.jpg"
107
  file_path = os.path.join(self.local_images_dir, new_filename)
108
+
109
+ try:
110
+ file.GetContentFile(file_path)
111
+ if is_valid_image(file_path):
112
+ renamed_files.append({
113
+ 'file_path': file_path,
114
+ 'original_name': file['title'],
115
+ 'new_name': new_filename
116
+ })
117
+ processed_count += 1
118
+ logger.info(f"Successfully processed: {file['title']} -> {new_filename}")
119
+ else:
120
+ error_count += 1
121
+ if os.path.exists(file_path):
122
+ os.remove(file_path)
123
+ except Exception as e:
124
+ error_count += 1
125
+ logger.error(f"Error processing file {file['title']}: {str(e)}")
126
+ if os.path.exists(file_path):
127
+ os.remove(file_path)
128
+
129
+ status_message = f"Processed {processed_count} images successfully"
130
+ if error_count > 0:
131
+ status_message += f" ({error_count} files failed)"
132
 
133
+ return True, status_message, renamed_files
134
  except Exception as e:
135
+ logger.error(f"Download error: {str(e)}")
136
  return False, f"Error during download: {str(e)}", []
137
 
138
  def update_huggingface_dataset(self, renamed_files):
139
  """Update Hugging Face dataset with new images."""
140
+ if not renamed_files:
141
+ return False, "No files to update"
142
+
143
  try:
144
  df = pd.DataFrame(renamed_files)
145
  new_dataset = Dataset.from_pandas(df)
 
153
  combined_dataset.push_to_hub(self.dataset_name, split="train")
154
  return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
155
  except Exception as e:
156
+ logger.error(f"Dataset update error: {str(e)}")
157
  return False, f"Error updating Hugging Face dataset: {str(e)}"
158
 
159
  # Process Pipeline
160
  def process_pipeline(folder_id, naming_convention):
161
  """Main pipeline for processing images and updating dataset."""
162
+ # Validate input
163
+ is_valid, error_message = validate_input(folder_id, naming_convention)
164
+ if not is_valid:
165
+ return error_message, []
166
+
167
  manager = DatasetManager()
168
 
169
  # Step 1: Authenticate Google Drive
 
181
  return f"{message}\n{hf_message}", renamed_files
182
 
183
  # Gradio Interface
184
# Gradio Interface: wires the processing pipeline to a simple web UI.
with gr.Blocks(title="Sports Cards Dataset Processor") as demo:
    gr.Markdown("# Sports Cards Dataset Processor")

    with gr.Box():
        gr.Markdown("""
        ### Instructions
        1. Enter the Google Drive folder ID (found in the folder's URL)
        2. Specify a naming convention for the files (e.g., 'sports_card')
        3. Click 'Process Images' to start

        Note: Only image files will be processed. Invalid images will be skipped.
        """)

    # Input widgets for the two pipeline parameters.
    with gr.Row():
        folder_id = gr.Textbox(
            label="Google Drive Folder ID",
            placeholder="Enter the folder ID from the URL",
            info="Found in your Google Drive folder's URL"
        )
        naming_convention = gr.Textbox(
            label="Naming Convention",
            placeholder="e.g., sports_card",
            value="sports_card",
            info="Use only letters, numbers, and underscores"
        )

    process_btn = gr.Button("Process Images", variant="primary")

    # Output widgets: free-text status plus a table of processed files.
    with gr.Row():
        with gr.Column():
            output = gr.Textbox(
                label="Processing Status",
                show_label=True,
                lines=3
            )

        with gr.Column():
            output_table = gr.Dataframe(
                label="Processed Files",
                headers=["Original Name", "New Name", "File Path"],
                wrap=True
            )

    def process_ui(folder_id, naming_convention):
        """Run the pipeline and shape its result for the status box and table."""
        status, renamed_files = process_pipeline(folder_id, naming_convention)
        rows = []
        if renamed_files:
            # `entry` avoids shadowing the `file` builtin name.
            rows = [[entry['original_name'], entry['new_name'], entry['file_path']]
                    for entry in renamed_files]
        return status, rows

    process_btn.click(
        fn=process_ui,
        inputs=[folder_id, naming_convention],
        outputs=[output, output_table]
    )

if __name__ == "__main__":
    demo.launch()