GotThatData committed on
Commit
71ac033
·
verified ·
1 Parent(s): b4b0910
Files changed (1) hide show
  1. app.py +35 -186
app.py CHANGED
@@ -23,146 +23,13 @@ if not os.path.exists("settings.yaml"):
23
  with open('settings.yaml', 'r') as file:
24
  settings = yaml.safe_load(file)
25
 
26
- # Utility Functions
27
- def safe_load_dataset(dataset_name):
28
- """Load Hugging Face dataset safely."""
29
- try:
30
- dataset = load_dataset(dataset_name)
31
- return dataset, len(dataset['train']) if 'train' in dataset else 0
32
- except Exception as e:
33
- logger.info(f"No existing dataset found. Starting fresh. Error: {str(e)}")
34
- return None, 0
35
 
36
- def is_valid_image(file_path):
37
- """Check if a file is a valid image."""
38
- try:
39
- with Image.open(file_path) as img:
40
- img.verify()
41
- return True
42
- except Exception as e:
43
- logger.error(f"Invalid image: {file_path}. Error: {str(e)}")
44
- return False
45
-
46
- def validate_input(folder_id, naming_convention):
47
- """Validate user input."""
48
- if not folder_id or not folder_id.strip():
49
- return False, "Folder ID cannot be empty"
50
- if not naming_convention or not naming_convention.strip():
51
- return False, "Naming convention cannot be empty"
52
- if not naming_convention.replace('_', '').isalnum():
53
- return False, "Naming convention should only contain letters, numbers, and underscores"
54
- return True, ""
55
-
56
- # DatasetManager Class
57
- class DatasetManager:
58
- def __init__(self, local_images_dir="downloaded_cards"):
59
- self.local_images_dir = local_images_dir
60
- self.drive = None
61
- self.dataset_name = "GotThatData/sports-cards"
62
- os.makedirs(local_images_dir, exist_ok=True)
63
-
64
- def authenticate_drive(self):
65
- """Authenticate with Google Drive."""
66
- try:
67
- gauth = GoogleAuth()
68
- gauth.settings['client_config_file'] = settings['client_secrets_file']
69
-
70
- # Try to load saved credentials
71
- gauth.LoadCredentialsFile("credentials.txt")
72
- if gauth.credentials is None:
73
- gauth.LocalWebserverAuth()
74
- elif gauth.access_token_expired:
75
- gauth.Refresh()
76
- else:
77
- gauth.Authorize()
78
- gauth.SaveCredentialsFile("credentials.txt")
79
-
80
- self.drive = GoogleDrive(gauth)
81
- return True, "Successfully authenticated with Google Drive"
82
- except Exception as e:
83
- logger.error(f"Authentication failed: {str(e)}")
84
- return False, f"Authentication failed: {str(e)}"
85
-
86
- def download_and_rename_files(self, drive_folder_id, naming_convention):
87
- """Download files from Google Drive and rename them."""
88
- if not self.drive:
89
- return False, "Google Drive not authenticated", []
90
-
91
- try:
92
- query = f"'{drive_folder_id}' in parents and trashed=false"
93
- file_list = self.drive.ListFile({'q': query}).GetList()
94
-
95
- if not file_list:
96
- logger.warning(f"No files found in folder: {drive_folder_id}")
97
- return False, "No files found in the specified folder.", []
98
-
99
- existing_dataset, start_index = safe_load_dataset(self.dataset_name)
100
- renamed_files = []
101
- processed_count = 0
102
- error_count = 0
103
-
104
- for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
105
- if 'mimeType' in file and 'image' in file['mimeType'].lower():
106
- new_filename = f"{naming_convention}_{start_index + processed_count + 1}.jpg"
107
- file_path = os.path.join(self.local_images_dir, new_filename)
108
-
109
- try:
110
- file.GetContentFile(file_path)
111
- if is_valid_image(file_path):
112
- renamed_files.append({
113
- 'file_path': file_path,
114
- 'original_name': file['title'],
115
- 'new_name': new_filename
116
- })
117
- processed_count += 1
118
- logger.info(f"Successfully processed: {file['title']} -> {new_filename}")
119
- else:
120
- error_count += 1
121
- if os.path.exists(file_path):
122
- os.remove(file_path)
123
- except Exception as e:
124
- error_count += 1
125
- logger.error(f"Error processing file {file['title']}: {str(e)}")
126
- if os.path.exists(file_path):
127
- os.remove(file_path)
128
-
129
- status_message = f"Processed {processed_count} images successfully"
130
- if error_count > 0:
131
- status_message += f" ({error_count} files failed)"
132
-
133
- return True, status_message, renamed_files
134
- except Exception as e:
135
- logger.error(f"Download error: {str(e)}")
136
- return False, f"Error during download: {str(e)}", []
137
-
138
- def update_huggingface_dataset(self, renamed_files):
139
- """Update Hugging Face dataset with new images."""
140
- if not renamed_files:
141
- return False, "No files to update"
142
-
143
- try:
144
- df = pd.DataFrame(renamed_files)
145
- new_dataset = Dataset.from_pandas(df)
146
-
147
- existing_dataset, _ = safe_load_dataset(self.dataset_name)
148
- if existing_dataset and 'train' in existing_dataset:
149
- combined_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
150
- else:
151
- combined_dataset = new_dataset
152
-
153
- combined_dataset.push_to_hub(self.dataset_name, split="train")
154
- return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
155
- except Exception as e:
156
- logger.error(f"Dataset update error: {str(e)}")
157
- return False, f"Error updating Hugging Face dataset: {str(e)}"
158
-
159
- # Process Pipeline
160
  def process_pipeline(folder_id, naming_convention):
161
  """Main pipeline for processing images and updating dataset."""
162
  # Validate input
163
- is_valid, error_message = validate_input(folder_id, naming_convention)
164
- if not is_valid:
165
- return error_message, []
166
 
167
  manager = DatasetManager()
168
 
@@ -180,61 +47,43 @@ def process_pipeline(folder_id, naming_convention):
180
  success, hf_message = manager.update_huggingface_dataset(renamed_files)
181
  return f"{message}\n{hf_message}", renamed_files
182
 
183
- # Gradio Interface
184
- with gr.Blocks(title="Sports Cards Dataset Processor") as demo:
185
- gr.Markdown("# Sports Cards Dataset Processor")
186
-
187
- with gr.Box():
188
- gr.Markdown("""
189
- ### Instructions
190
- 1. Enter the Google Drive folder ID (found in the folder's URL)
191
- 2. Specify a naming convention for the files (e.g., 'sports_card')
192
- 3. Click 'Process Images' to start
193
-
194
- Note: Only image files will be processed. Invalid images will be skipped.
195
- """)
196
-
197
- with gr.Row():
198
- folder_id = gr.Textbox(
199
  label="Google Drive Folder ID",
200
- placeholder="Enter the folder ID from the URL",
201
- info="Found in your Google Drive folder's URL"
202
- )
203
- naming_convention = gr.Textbox(
204
  label="Naming Convention",
205
  placeholder="e.g., sports_card",
206
- value="sports_card",
207
- info="Use only letters, numbers, and underscores"
208
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
- process_btn = gr.Button("Process Images", variant="primary")
211
-
212
- with gr.Row():
213
- with gr.Column():
214
- output = gr.Textbox(
215
- label="Processing Status",
216
- show_label=True,
217
- lines=3
218
- )
219
-
220
- with gr.Column():
221
- output_table = gr.Dataframe(
222
- label="Processed Files",
223
- headers=["Original Name", "New Name", "File Path"],
224
- wrap=True
225
- )
226
-
227
- def process_ui(folder_id, naming_convention):
228
- status, renamed_files = process_pipeline(folder_id, naming_convention)
229
- table_data = [[file['original_name'], file['new_name'], file['file_path']]
230
- for file in renamed_files] if renamed_files else []
231
- return status, table_data
232
-
233
- process_btn.click(
234
- fn=process_ui,
235
- inputs=[folder_id, naming_convention],
236
- outputs=[output, output_table]
237
- )
238
 
239
  if __name__ == "__main__":
240
  demo.launch()
 
23
  with open('settings.yaml', 'r') as file:
24
  settings = yaml.safe_load(file)
25
 
26
+ [... keep all the utility functions and DatasetManager class the same ...]
 
 
 
 
 
 
 
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def process_pipeline(folder_id, naming_convention):
29
  """Main pipeline for processing images and updating dataset."""
30
  # Validate input
31
+ if not folder_id or not naming_convention:
32
+ return "Please provide both folder ID and naming convention", []
 
33
 
34
  manager = DatasetManager()
35
 
 
47
  success, hf_message = manager.update_huggingface_dataset(renamed_files)
48
  return f"{message}\n{hf_message}", renamed_files
49
 
50
+ def process_ui(folder_id, naming_convention):
51
+ """UI handler for the process pipeline"""
52
+ status, renamed_files = process_pipeline(folder_id, naming_convention)
53
+ table_data = [[file['original_name'], file['new_name'], file['file_path']]
54
+ for file in renamed_files] if renamed_files else []
55
+ return status, table_data
56
+
57
+ # Simplified Gradio interface
58
+ demo = gr.Interface(
59
+ fn=process_ui,
60
+ inputs=[
61
+ gr.Textbox(
 
 
 
 
62
  label="Google Drive Folder ID",
63
+ placeholder="Enter the folder ID from the URL"
64
+ ),
65
+ gr.Textbox(
 
66
  label="Naming Convention",
67
  placeholder="e.g., sports_card",
68
+ value="sports_card"
 
69
  )
70
+ ],
71
+ outputs=[
72
+ gr.Textbox(label="Status"),
73
+ gr.Dataframe(
74
+ headers=["Original Name", "New Name", "File Path"]
75
+ )
76
+ ],
77
+ title="Sports Cards Dataset Processor",
78
+ description="""
79
+ Instructions:
80
+ 1. Enter the Google Drive folder ID (found in the folder's URL)
81
+ 2. Specify a naming convention for the files (e.g., 'sports_card')
82
+ 3. Click submit to start processing
83
 
84
+ Note: Only image files will be processed. Invalid images will be skipped.
85
+ """
86
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  if __name__ == "__main__":
89
  demo.launch()