GotThatData committed on
Commit
b489b3f
·
verified ·
1 Parent(s): 8067321
Files changed (1) hide show
  1. app.py +81 -121
app.py CHANGED
@@ -2,7 +2,7 @@ from pydrive2.auth import GoogleAuth
2
  from pydrive2.drive import GoogleDrive
3
  import os
4
  import gradio as gr
5
- from datasets import load_dataset, Dataset
6
  import pandas as pd
7
  from PIL import Image
8
  from tqdm import tqdm
@@ -14,34 +14,53 @@ logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
  # Load settings
 
 
 
17
  with open('settings.yaml', 'r') as file:
18
  settings = yaml.safe_load(file)
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  class DatasetManager:
21
  def __init__(self, local_images_dir="downloaded_cards"):
22
  self.local_images_dir = local_images_dir
23
  self.drive = None
24
  self.dataset_name = "GotThatData/sports-cards"
25
-
26
- # Create local directory if it doesn't exist
27
  os.makedirs(local_images_dir, exist_ok=True)
28
-
29
  def authenticate_drive(self):
30
- """Authenticate with Google Drive"""
31
  try:
32
  gauth = GoogleAuth()
33
  gauth.settings['client_config_file'] = settings['client_secrets_file']
34
 
35
  # Try to load saved credentials
36
  gauth.LoadCredentialsFile("credentials.txt")
37
-
38
  if gauth.credentials is None:
39
  gauth.LocalWebserverAuth()
40
  elif gauth.access_token_expired:
41
  gauth.Refresh()
42
  else:
43
  gauth.Authorize()
44
-
45
  gauth.SaveCredentialsFile("credentials.txt")
46
 
47
  self.drive = GoogleDrive(gauth)
@@ -50,156 +69,97 @@ class DatasetManager:
50
  return False, f"Authentication failed: {str(e)}"
51
 
52
  def download_and_rename_files(self, drive_folder_id, naming_convention):
53
- """Download files from Google Drive and rename them"""
54
  if not self.drive:
55
  return False, "Google Drive not authenticated", []
56
 
57
  try:
58
  query = f"'{drive_folder_id}' in parents and trashed=false"
59
  file_list = self.drive.ListFile({'q': query}).GetList()
60
-
61
  if not file_list:
62
- file = self.drive.CreateFile({'id': drive_folder_id})
63
- if file:
64
- file_list = [file]
65
- else:
66
- return False, "No files found with the specified ID", []
67
-
68
  renamed_files = []
69
- try:
70
- existing_dataset = load_dataset(self.dataset_name)
71
- logger.info(f"Loaded existing dataset: {self.dataset_name}")
72
- start_index = len(existing_dataset['train']) if 'train' in existing_dataset else 0
73
- except Exception as e:
74
- logger.info(f"No existing dataset found, starting fresh: {str(e)}")
75
- start_index = 0
76
-
77
- for i, file in enumerate(tqdm(file_list, desc="Downloading files")):
78
- if file['mimeType'].startswith('image/'):
79
  new_filename = f"{naming_convention}_{start_index + i + 1}.jpg"
80
  file_path = os.path.join(self.local_images_dir, new_filename)
81
-
82
  file.GetContentFile(file_path)
83
-
84
- try:
85
- with Image.open(file_path) as img:
86
- img.verify()
87
  renamed_files.append({
88
  'file_path': file_path,
89
  'original_name': file['title'],
90
- 'new_name': new_filename,
91
- 'image': file_path
92
  })
93
- except Exception as e:
94
- logger.error(f"Error processing image {file['title']}: {str(e)}")
95
- if os.path.exists(file_path):
96
- os.remove(file_path)
97
 
98
- return True, f"Successfully processed {len(renamed_files)} images", renamed_files
99
  except Exception as e:
100
- return False, f"Error downloading files: {str(e)}", []
101
 
102
  def update_huggingface_dataset(self, renamed_files):
103
- """Update the sports-cards dataset with new images"""
104
  try:
105
  df = pd.DataFrame(renamed_files)
106
  new_dataset = Dataset.from_pandas(df)
 
 
 
 
 
 
107
 
108
- try:
109
- existing_dataset = load_dataset(self.dataset_name)
110
- if 'train' in existing_dataset:
111
- new_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
112
- except Exception:
113
- logger.info("Creating new dataset")
114
-
115
- new_dataset.push_to_hub(self.dataset_name, split="train")
116
-
117
- return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images"
118
  except Exception as e:
119
  return False, f"Error updating Hugging Face dataset: {str(e)}"
120
 
 
121
  def process_pipeline(folder_id, naming_convention):
122
- """Main pipeline to process images and update dataset"""
123
  manager = DatasetManager()
124
-
 
125
  auth_success, auth_message = manager.authenticate_drive()
126
  if not auth_success:
127
- return auth_message
128
-
 
129
  success, message, renamed_files = manager.download_and_rename_files(folder_id, naming_convention)
130
  if not success:
131
- return message
132
-
 
133
  success, hf_message = manager.update_huggingface_dataset(renamed_files)
134
- return f"{message}\n{hf_message}"
135
-
136
- # Custom CSS for web-safe fonts and clean styling
137
- custom_css = """
138
- .gradio-container {
139
- font-family: Arial, sans-serif !important;
140
- }
141
-
142
- h1, h2, h3 {
143
- font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
144
- font-weight: 600 !important;
145
- }
146
-
147
- .gr-button {
148
- font-family: Arial, sans-serif !important;
149
- }
150
-
151
- .gr-input {
152
- font-family: 'Courier New', Courier, monospace !important;
153
- }
154
-
155
- .gr-box {
156
- border-radius: 8px !important;
157
- border: 1px solid #e5e5e5 !important;
158
- }
159
-
160
- .gr-padded {
161
- padding: 16px !important;
162
- }
163
- """
164
-
165
- # Gradio interface with custom theme
166
- with gr.Blocks(css=custom_css) as demo:
167
  gr.Markdown("# Sports Cards Dataset Processor")
168
-
169
  with gr.Box():
170
- gr.Markdown("""
171
- ### Instructions
172
- 1. Enter the Google Drive folder/file ID
173
- 2. Choose a naming convention for your cards
174
- 3. Click Process to start
175
- """)
176
 
177
  with gr.Row():
178
- with gr.Column():
179
- folder_id = gr.Textbox(
180
- label="Google Drive File/Folder ID",
181
- placeholder="Enter the ID from your Google Drive URL",
182
- value="151VOxPO91mg0C3ORiioGUd4hogzP1ujm"
183
- )
184
- naming = gr.Textbox(
185
- label="Naming Convention",
186
- placeholder="e.g., sports_card",
187
- value="sports_card"
188
- )
189
- process_btn = gr.Button("Process Images", variant="primary")
190
-
191
- with gr.Box():
192
- output = gr.Textbox(
193
- label="Processing Status",
194
- show_label=True,
195
- lines=5
196
- )
197
-
198
- process_btn.click(
199
- fn=process_pipeline,
200
- inputs=[folder_id, naming],
201
- outputs=output
202
- )
203
 
204
  if __name__ == "__main__":
205
- demo.launch()
 
2
  from pydrive2.drive import GoogleDrive
3
  import os
4
  import gradio as gr
5
+ from datasets import load_dataset, Dataset, concatenate_datasets
6
  import pandas as pd
7
  from PIL import Image
8
  from tqdm import tqdm
 
14
  logger = logging.getLogger(__name__)
15
 
16
# Load settings: fail fast with a clear message when the config file is absent.
if not os.path.exists("settings.yaml"):
    raise FileNotFoundError("settings.yaml file is missing. Please add it with 'client_secrets_file'.")

# The parsed mapping must expose 'client_secrets_file' for the Drive auth step.
with open("settings.yaml", "r") as config_file:
    settings = yaml.safe_load(config_file)
22
 
23
# Utility Functions
def safe_load_dataset(dataset_name):
    """Fetch *dataset_name* from the Hugging Face Hub, tolerating absence.

    Returns a ``(dataset, train_size)`` pair. When the dataset cannot be
    loaded for any reason the pair is ``(None, 0)`` so callers can start
    a fresh dataset instead of crashing.
    """
    try:
        loaded = load_dataset(dataset_name)
    except Exception:
        # Broad on purpose: a missing repo, auth failure, or network error
        # all mean "treat as empty" for this app.
        logger.info("No existing dataset found. Starting fresh.")
        return None, 0
    train_size = len(loaded['train']) if 'train' in loaded else 0
    return loaded, train_size
32
+
33
def is_valid_image(file_path):
    """Check if a file is a valid image.

    Uses PIL's ``verify()``, which inspects the file for structural
    corruption without decoding the full pixel data. Returns True when
    the file parses as an image, False otherwise.
    """
    try:
        with Image.open(file_path) as img:
            img.verify()
        return True
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed; any PIL or I/O error still means "invalid".
        return False
41
+
42
+ # DatasetManager Class
43
  class DatasetManager:
44
  def __init__(self, local_images_dir="downloaded_cards"):
45
  self.local_images_dir = local_images_dir
46
  self.drive = None
47
  self.dataset_name = "GotThatData/sports-cards"
 
 
48
  os.makedirs(local_images_dir, exist_ok=True)
49
+
50
  def authenticate_drive(self):
51
+ """Authenticate with Google Drive."""
52
  try:
53
  gauth = GoogleAuth()
54
  gauth.settings['client_config_file'] = settings['client_secrets_file']
55
 
56
  # Try to load saved credentials
57
  gauth.LoadCredentialsFile("credentials.txt")
 
58
  if gauth.credentials is None:
59
  gauth.LocalWebserverAuth()
60
  elif gauth.access_token_expired:
61
  gauth.Refresh()
62
  else:
63
  gauth.Authorize()
 
64
  gauth.SaveCredentialsFile("credentials.txt")
65
 
66
  self.drive = GoogleDrive(gauth)
 
69
  return False, f"Authentication failed: {str(e)}"
70
 
71
  def download_and_rename_files(self, drive_folder_id, naming_convention):
72
+ """Download files from Google Drive and rename them."""
73
  if not self.drive:
74
  return False, "Google Drive not authenticated", []
75
 
76
  try:
77
  query = f"'{drive_folder_id}' in parents and trashed=false"
78
  file_list = self.drive.ListFile({'q': query}).GetList()
 
79
  if not file_list:
80
+ return False, "No files found in the specified folder.", []
81
+
82
+ existing_dataset, start_index = safe_load_dataset(self.dataset_name)
 
 
 
83
  renamed_files = []
84
+
85
+ for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
86
+ if 'mimeType' in file and 'image' in file['mimeType'].lower():
 
 
 
 
 
 
 
87
  new_filename = f"{naming_convention}_{start_index + i + 1}.jpg"
88
  file_path = os.path.join(self.local_images_dir, new_filename)
 
89
  file.GetContentFile(file_path)
90
+
91
+ if is_valid_image(file_path):
 
 
92
  renamed_files.append({
93
  'file_path': file_path,
94
  'original_name': file['title'],
95
+ 'new_name': new_filename
 
96
  })
97
+ logger.info(f"Downloaded and renamed: {file['title']} -> {new_filename}")
98
+ else:
99
+ logger.error(f"Invalid image detected, removing {file_path}")
100
+ os.remove(file_path)
101
 
102
+ return True, f"Processed {len(renamed_files)} images", renamed_files
103
  except Exception as e:
104
+ return False, f"Error during download: {str(e)}", []
105
 
106
  def update_huggingface_dataset(self, renamed_files):
107
+ """Update Hugging Face dataset with new images."""
108
  try:
109
  df = pd.DataFrame(renamed_files)
110
  new_dataset = Dataset.from_pandas(df)
111
+
112
+ existing_dataset, _ = safe_load_dataset(self.dataset_name)
113
+ if existing_dataset and 'train' in existing_dataset:
114
+ combined_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
115
+ else:
116
+ combined_dataset = new_dataset
117
 
118
+ combined_dataset.push_to_hub(self.dataset_name, split="train")
119
+ return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
 
 
 
 
 
 
 
 
120
  except Exception as e:
121
  return False, f"Error updating Hugging Face dataset: {str(e)}"
122
 
123
+ # Process Pipeline
124
# Process Pipeline
def process_pipeline(folder_id, naming_convention):
    """Run the full flow: Drive auth, image download/rename, dataset push.

    Returns a (status_text, processed_files) tuple; processed_files is
    empty whenever an earlier stage fails.
    """
    mgr = DatasetManager()

    # Stage 1: Google Drive authentication.
    ok, auth_msg = mgr.authenticate_drive()
    if not ok:
        return auth_msg, []

    # Stage 2: download and rename the card images.
    ok, dl_msg, processed = mgr.download_and_rename_files(folder_id, naming_convention)
    if not ok:
        return dl_msg, []

    # Stage 3: push to the Hugging Face Hub (its own message is surfaced
    # in the status text whether or not the push succeeded).
    _, hub_msg = mgr.update_huggingface_dataset(processed)
    return f"{dl_msg}\n{hub_msg}", processed
141
+
142
# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# Sports Cards Dataset Processor")

    with gr.Box():
        gr.Markdown("### Instructions: Upload from Google Drive and Update Hugging Face Dataset")

    with gr.Row():
        folder_id = gr.Textbox(label="Google Drive Folder ID", placeholder="Enter the folder ID")
        naming_convention = gr.Textbox(label="Naming Convention", placeholder="e.g., sports_card")
        process_btn = gr.Button("Process Images")

    output = gr.Textbox(label="Status")
    output_table = gr.Dataframe(label="Processed Files", headers=["Original Name", "New Name", "File Path"])

    def process_ui(folder_id, naming_convention):
        """Bridge the pipeline result into (status string, table rows) for the UI."""
        status, processed = process_pipeline(folder_id, naming_convention)
        rows = [[entry['original_name'], entry['new_name'], entry['file_path']] for entry in processed]
        return status, rows

    process_btn.click(process_ui, inputs=[folder_id, naming_convention], outputs=[output, output_table])
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
# Launch the Gradio app only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()