Spaces:
Sleeping
Sleeping
GotThatData
committed on
Update
Browse files
app.py
CHANGED
@@ -10,7 +10,10 @@ import logging
|
|
10 |
import yaml
|
11 |
|
12 |
# Set up logging
|
13 |
-
logging.basicConfig(
|
|
|
|
|
|
|
14 |
logger = logging.getLogger(__name__)
|
15 |
|
16 |
# Load settings
|
@@ -26,8 +29,8 @@ def safe_load_dataset(dataset_name):
|
|
26 |
try:
|
27 |
dataset = load_dataset(dataset_name)
|
28 |
return dataset, len(dataset['train']) if 'train' in dataset else 0
|
29 |
-
except Exception:
|
30 |
-
logger.info("No existing dataset found. Starting fresh.")
|
31 |
return None, 0
|
32 |
|
33 |
def is_valid_image(file_path):
|
@@ -35,10 +38,21 @@ def is_valid_image(file_path):
|
|
35 |
try:
|
36 |
with Image.open(file_path) as img:
|
37 |
img.verify()
|
38 |
-
|
39 |
-
except:
|
|
|
40 |
return False
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
# DatasetManager Class
|
43 |
class DatasetManager:
|
44 |
def __init__(self, local_images_dir="downloaded_cards"):
|
@@ -66,6 +80,7 @@ class DatasetManager:
|
|
66 |
self.drive = GoogleDrive(gauth)
|
67 |
return True, "Successfully authenticated with Google Drive"
|
68 |
except Exception as e:
|
|
|
69 |
return False, f"Authentication failed: {str(e)}"
|
70 |
|
71 |
def download_and_rename_files(self, drive_folder_id, naming_convention):
|
@@ -76,35 +91,55 @@ class DatasetManager:
|
|
76 |
try:
|
77 |
query = f"'{drive_folder_id}' in parents and trashed=false"
|
78 |
file_list = self.drive.ListFile({'q': query}).GetList()
|
|
|
79 |
if not file_list:
|
|
|
80 |
return False, "No files found in the specified folder.", []
|
81 |
|
82 |
existing_dataset, start_index = safe_load_dataset(self.dataset_name)
|
83 |
renamed_files = []
|
|
|
|
|
84 |
|
85 |
for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
|
86 |
if 'mimeType' in file and 'image' in file['mimeType'].lower():
|
87 |
-
new_filename = f"{naming_convention}_{start_index +
|
88 |
file_path = os.path.join(self.local_images_dir, new_filename)
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
-
return True,
|
103 |
except Exception as e:
|
|
|
104 |
return False, f"Error during download: {str(e)}", []
|
105 |
|
106 |
def update_huggingface_dataset(self, renamed_files):
|
107 |
"""Update Hugging Face dataset with new images."""
|
|
|
|
|
|
|
108 |
try:
|
109 |
df = pd.DataFrame(renamed_files)
|
110 |
new_dataset = Dataset.from_pandas(df)
|
@@ -118,11 +153,17 @@ class DatasetManager:
|
|
118 |
combined_dataset.push_to_hub(self.dataset_name, split="train")
|
119 |
return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
|
120 |
except Exception as e:
|
|
|
121 |
return False, f"Error updating Hugging Face dataset: {str(e)}"
|
122 |
|
123 |
# Process Pipeline
|
124 |
def process_pipeline(folder_id, naming_convention):
|
125 |
"""Main pipeline for processing images and updating dataset."""
|
|
|
|
|
|
|
|
|
|
|
126 |
manager = DatasetManager()
|
127 |
|
128 |
# Step 1: Authenticate Google Drive
|
@@ -140,26 +181,60 @@ def process_pipeline(folder_id, naming_convention):
|
|
140 |
return f"{message}\n{hf_message}", renamed_files
|
141 |
|
142 |
# Gradio Interface
|
143 |
-
with gr.Blocks() as demo:
|
144 |
gr.Markdown("# Sports Cards Dataset Processor")
|
145 |
|
146 |
with gr.Box():
|
147 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
with gr.Row():
|
150 |
-
folder_id = gr.Textbox(
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
def process_ui(folder_id, naming_convention):
|
158 |
status, renamed_files = process_pipeline(folder_id, naming_convention)
|
159 |
-
table_data = [[file['original_name'], file['new_name'], file['file_path']]
|
|
|
160 |
return status, table_data
|
161 |
|
162 |
-
process_btn.click(
|
|
|
|
|
|
|
|
|
163 |
|
164 |
if __name__ == "__main__":
|
165 |
-
demo.launch()
|
|
|
10 |
import yaml
|
11 |
|
12 |
# Set up logging
|
13 |
+
logging.basicConfig(
|
14 |
+
level=logging.INFO,
|
15 |
+
format='%(asctime)s - %(levelname)s - %(message)s'
|
16 |
+
)
|
17 |
logger = logging.getLogger(__name__)
|
18 |
|
19 |
# Load settings
|
|
|
29 |
try:
|
30 |
dataset = load_dataset(dataset_name)
|
31 |
return dataset, len(dataset['train']) if 'train' in dataset else 0
|
32 |
+
except Exception as e:
|
33 |
+
logger.info(f"No existing dataset found. Starting fresh. Error: {str(e)}")
|
34 |
return None, 0
|
35 |
|
36 |
def is_valid_image(file_path):
|
|
|
38 |
try:
|
39 |
with Image.open(file_path) as img:
|
40 |
img.verify()
|
41 |
+
return True
|
42 |
+
except Exception as e:
|
43 |
+
logger.error(f"Invalid image: {file_path}. Error: {str(e)}")
|
44 |
return False
|
45 |
|
46 |
+
def validate_input(folder_id, naming_convention):
    """Validate the user-supplied folder ID and naming convention.

    Args:
        folder_id: Google Drive folder ID entered by the user.
        naming_convention: Prefix used to build the renamed file names.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is an empty string when
        the input is valid, otherwise a human-readable reason for rejection.
    """
    if not folder_id or not folder_id.strip():
        return False, "Folder ID cannot be empty"
    if not naming_convention or not naming_convention.strip():
        return False, "Naming convention cannot be empty"
    # Underscores are allowed, so drop them before the alphanumeric test.
    if not naming_convention.replace('_', '').isalnum():
        return False, "Naming convention should only contain letters, numbers, and underscores"
    # The folder ID is later placed between single quotes in the Drive search
    # query; a quote or backslash would corrupt that query, so reject it here.
    # Checked last so every pre-existing failure result stays unchanged.
    if "'" in folder_id or "\\" in folder_id:
        return False, "Folder ID contains invalid characters"
    return True, ""
|
55 |
+
|
56 |
# DatasetManager Class
|
57 |
class DatasetManager:
|
58 |
def __init__(self, local_images_dir="downloaded_cards"):
|
|
|
80 |
self.drive = GoogleDrive(gauth)
|
81 |
return True, "Successfully authenticated with Google Drive"
|
82 |
except Exception as e:
|
83 |
+
logger.error(f"Authentication failed: {str(e)}")
|
84 |
return False, f"Authentication failed: {str(e)}"
|
85 |
|
86 |
def download_and_rename_files(self, drive_folder_id, naming_convention):
|
|
|
91 |
try:
|
92 |
query = f"'{drive_folder_id}' in parents and trashed=false"
|
93 |
file_list = self.drive.ListFile({'q': query}).GetList()
|
94 |
+
|
95 |
if not file_list:
|
96 |
+
logger.warning(f"No files found in folder: {drive_folder_id}")
|
97 |
return False, "No files found in the specified folder.", []
|
98 |
|
99 |
existing_dataset, start_index = safe_load_dataset(self.dataset_name)
|
100 |
renamed_files = []
|
101 |
+
processed_count = 0
|
102 |
+
error_count = 0
|
103 |
|
104 |
for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
|
105 |
if 'mimeType' in file and 'image' in file['mimeType'].lower():
|
106 |
+
new_filename = f"{naming_convention}_{start_index + processed_count + 1}.jpg"
|
107 |
file_path = os.path.join(self.local_images_dir, new_filename)
|
108 |
+
|
109 |
+
try:
|
110 |
+
file.GetContentFile(file_path)
|
111 |
+
if is_valid_image(file_path):
|
112 |
+
renamed_files.append({
|
113 |
+
'file_path': file_path,
|
114 |
+
'original_name': file['title'],
|
115 |
+
'new_name': new_filename
|
116 |
+
})
|
117 |
+
processed_count += 1
|
118 |
+
logger.info(f"Successfully processed: {file['title']} -> {new_filename}")
|
119 |
+
else:
|
120 |
+
error_count += 1
|
121 |
+
if os.path.exists(file_path):
|
122 |
+
os.remove(file_path)
|
123 |
+
except Exception as e:
|
124 |
+
error_count += 1
|
125 |
+
logger.error(f"Error processing file {file['title']}: {str(e)}")
|
126 |
+
if os.path.exists(file_path):
|
127 |
+
os.remove(file_path)
|
128 |
+
|
129 |
+
status_message = f"Processed {processed_count} images successfully"
|
130 |
+
if error_count > 0:
|
131 |
+
status_message += f" ({error_count} files failed)"
|
132 |
|
133 |
+
return True, status_message, renamed_files
|
134 |
except Exception as e:
|
135 |
+
logger.error(f"Download error: {str(e)}")
|
136 |
return False, f"Error during download: {str(e)}", []
|
137 |
|
138 |
def update_huggingface_dataset(self, renamed_files):
|
139 |
"""Update Hugging Face dataset with new images."""
|
140 |
+
if not renamed_files:
|
141 |
+
return False, "No files to update"
|
142 |
+
|
143 |
try:
|
144 |
df = pd.DataFrame(renamed_files)
|
145 |
new_dataset = Dataset.from_pandas(df)
|
|
|
153 |
combined_dataset.push_to_hub(self.dataset_name, split="train")
|
154 |
return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
|
155 |
except Exception as e:
|
156 |
+
logger.error(f"Dataset update error: {str(e)}")
|
157 |
return False, f"Error updating Hugging Face dataset: {str(e)}"
|
158 |
|
159 |
# Process Pipeline
|
160 |
def process_pipeline(folder_id, naming_convention):
|
161 |
"""Main pipeline for processing images and updating dataset."""
|
162 |
+
# Validate input
|
163 |
+
is_valid, error_message = validate_input(folder_id, naming_convention)
|
164 |
+
if not is_valid:
|
165 |
+
return error_message, []
|
166 |
+
|
167 |
manager = DatasetManager()
|
168 |
|
169 |
# Step 1: Authenticate Google Drive
|
|
|
181 |
return f"{message}\n{hf_message}", renamed_files
|
182 |
|
183 |
# Gradio Interface
|
184 |
+
with gr.Blocks(title="Sports Cards Dataset Processor") as demo:
|
185 |
gr.Markdown("# Sports Cards Dataset Processor")
|
186 |
|
187 |
with gr.Box():
|
188 |
+
gr.Markdown("""
|
189 |
+
### Instructions
|
190 |
+
1. Enter the Google Drive folder ID (found in the folder's URL)
|
191 |
+
2. Specify a naming convention for the files (e.g., 'sports_card')
|
192 |
+
3. Click 'Process Images' to start
|
193 |
+
|
194 |
+
Note: Only image files will be processed. Invalid images will be skipped.
|
195 |
+
""")
|
196 |
|
197 |
with gr.Row():
|
198 |
+
folder_id = gr.Textbox(
|
199 |
+
label="Google Drive Folder ID",
|
200 |
+
placeholder="Enter the folder ID from the URL",
|
201 |
+
info="Found in your Google Drive folder's URL"
|
202 |
+
)
|
203 |
+
naming_convention = gr.Textbox(
|
204 |
+
label="Naming Convention",
|
205 |
+
placeholder="e.g., sports_card",
|
206 |
+
value="sports_card",
|
207 |
+
info="Use only letters, numbers, and underscores"
|
208 |
+
)
|
209 |
+
|
210 |
+
process_btn = gr.Button("Process Images", variant="primary")
|
211 |
+
|
212 |
+
with gr.Row():
|
213 |
+
with gr.Column():
|
214 |
+
output = gr.Textbox(
|
215 |
+
label="Processing Status",
|
216 |
+
show_label=True,
|
217 |
+
lines=3
|
218 |
+
)
|
219 |
+
|
220 |
+
with gr.Column():
|
221 |
+
output_table = gr.Dataframe(
|
222 |
+
label="Processed Files",
|
223 |
+
headers=["Original Name", "New Name", "File Path"],
|
224 |
+
wrap=True
|
225 |
+
)
|
226 |
|
227 |
def process_ui(folder_id, naming_convention):
    """Run the pipeline and shape its result for the UI outputs.

    Returns the status text plus one ``[original name, new name, file path]``
    row per processed file (an empty list when nothing was processed).
    """
    status, renamed_files = process_pipeline(folder_id, naming_convention)
    rows = []
    # A falsy result (None or empty) yields no rows.
    for entry in renamed_files or []:
        rows.append([entry['original_name'], entry['new_name'], entry['file_path']])
    return status, rows
|
232 |
|
233 |
+
process_btn.click(
|
234 |
+
fn=process_ui,
|
235 |
+
inputs=[folder_id, naming_convention],
|
236 |
+
outputs=[output, output_table]
|
237 |
+
)
|
238 |
|
239 |
if __name__ == "__main__":
|
240 |
+
demo.launch()
|