from pydrive2.auth import GoogleAuth from pydrive2.drive import GoogleDrive import os import gradio as gr from datasets import load_dataset, Dataset, concatenate_datasets import pandas as pd from PIL import Image from tqdm import tqdm import logging import yaml from huggingface_hub import login import json # Add this import # Authenticate with Hugging Face HF_TOKEN = os.getenv('HF_TOKEN') if HF_TOKEN: login(token=HF_TOKEN) else: logger.warning("No Hugging Face token found. Please add HF_TOKEN to your Space secrets.") # Set up logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Load settings if not os.path.exists("settings.yaml"): raise FileNotFoundError("settings.yaml file is missing. Please add it with 'client_secrets_file'.") with open('settings.yaml', 'r') as file: settings = yaml.safe_load(file) # Utility Functions def safe_load_dataset(dataset_name): """Load Hugging Face dataset safely.""" try: dataset = load_dataset(dataset_name) return dataset, len(dataset['train']) if 'train' in dataset else 0 except Exception as e: logger.info(f"No existing dataset found. Starting fresh. Error: {str(e)}") return None, 0 def is_valid_image(file_path): """Check if a file is a valid image.""" try: with Image.open(file_path) as img: img.verify() return True except Exception as e: logger.error(f"Invalid image: {file_path}. Error: {str(e)}") return False def validate_input(folder_id, naming_convention): """Validate user input.""" if not folder_id or not folder_id.strip(): return False, "Folder ID cannot be empty" if not naming_convention or not naming_convention.strip(): return False, "Naming convention cannot be empty" if not naming_convention.replace('_', '').isalnum(): return False, "Naming convention should only contain letters, numbers, and underscores" return True, "" def initialize_dataset(): """Initialize or verify the dataset structure.""" try: # Check if the README.md exists, if not create it readme_content = """# Sports Cards Dataset This dataset contains sports card images with structured metadata. Each image is named using a consistent convention and includes relevant information about the card. ## Dataset Structure ``` sports_card_{number}.jpg - Card images ``` ## Features - file_path: Path to the image file - original_name: Original filename of the card - new_name: Standardized filename - image: Image data ## Usage ```python from datasets import load_dataset dataset = load_dataset("GotThatData/sports-cards") ``` ## License This dataset is licensed under MIT. ## Creator Created by GotThatData """ # Create dataset info content dataset_info = { "description": "A collection of sports card images with metadata", "citation": "", "homepage": "https://huggingface.co/datasets/GotThatData/sports-cards", "license": "mit", "features": { "file_path": {"dtype": "string", "_type": "Value"}, "original_name": {"dtype": "string", "_type": "Value"}, "new_name": {"dtype": "string", "_type": "Value"}, "image": {"dtype": "string", "_type": "Value"} }, "splits": ["train"] } # Write files with open("README.md", "w") as f: f.write(readme_content) with open("dataset-info.json", "w") as f: json.dump(dataset_info, f, indent=2) # Upload files to repository upload_file( path_or_fileobj="README.md", path_in_repo="README.md", repo_id="GotThatData/sports-cards", repo_type="dataset" ) upload_file( path_or_fileobj="dataset-info.json", path_in_repo="dataset-info.json", repo_id="GotThatData/sports-cards", repo_type="dataset" ) return True, "Dataset structure initialized successfully" except Exception as e: return False, f"Failed to initialize dataset: {str(e)}" # DatasetManager Class class DatasetManager: def __init__(self, local_images_dir="downloaded_cards"): self.local_images_dir = local_images_dir self.drive = None self.dataset_name = "GotThatData/sports-cards" os.makedirs(local_images_dir, exist_ok=True) # Initialize dataset structure success, message = initialize_dataset() if not success: logger.warning(f"Dataset initialization warning: {message}") def authenticate_drive(self): """Authenticate with Google Drive.""" try: gauth = GoogleAuth() gauth.settings['client_config_file'] = settings['client_secrets_file'] # Try to load saved credentials gauth.LoadCredentialsFile("credentials.txt") if gauth.credentials is None: gauth.LocalWebserverAuth() elif gauth.access_token_expired: gauth.Refresh() else: gauth.Authorize() gauth.SaveCredentialsFile("credentials.txt") self.drive = GoogleDrive(gauth) return True, "Successfully authenticated with Google Drive" except Exception as e: logger.error(f"Authentication failed: {str(e)}") return False, f"Authentication failed: {str(e)}" def download_and_rename_files(self, drive_folder_id, naming_convention): """Download files from Google Drive and rename them.""" if not self.drive: return False, "Google Drive not authenticated", [] try: query = f"'{drive_folder_id}' in parents and trashed=false" file_list = self.drive.ListFile({'q': query}).GetList() if not file_list: logger.warning(f"No files found in folder: {drive_folder_id}") return False, "No files found in the specified folder.", [] existing_dataset, start_index = safe_load_dataset(self.dataset_name) renamed_files = [] processed_count = 0 error_count = 0 for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")): if 'mimeType' in file and 'image' in file['mimeType'].lower(): new_filename = f"{naming_convention}_{start_index + processed_count + 1}.jpg" file_path = os.path.join(self.local_images_dir, new_filename) try: file.GetContentFile(file_path) if is_valid_image(file_path): renamed_files.append({ 'file_path': file_path, 'original_name': file['title'], 'new_name': new_filename }) processed_count += 1 logger.info(f"Successfully processed: {file['title']} -> {new_filename}") else: error_count += 1 if os.path.exists(file_path): os.remove(file_path) except Exception as e: error_count += 1 logger.error(f"Error processing file {file['title']}: {str(e)}") if os.path.exists(file_path): os.remove(file_path) status_message = f"Processed {processed_count} images successfully" if error_count > 0: status_message += f" ({error_count} files failed)" return True, status_message, renamed_files except Exception as e: logger.error(f"Download error: {str(e)}") return False, f"Error during download: {str(e)}", [] def update_huggingface_dataset(self, renamed_files): """Update Hugging Face dataset with new images.""" if not renamed_files: return False, "No files to update" try: df = pd.DataFrame(renamed_files) new_dataset = Dataset.from_pandas(df) existing_dataset, _ = safe_load_dataset(self.dataset_name) if existing_dataset and 'train' in existing_dataset: combined_dataset = concatenate_datasets([existing_dataset['train'], new_dataset]) else: combined_dataset = new_dataset combined_dataset.push_to_hub(self.dataset_name, split="train") return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images." except Exception as e: logger.error(f"Dataset update error: {str(e)}") return False, f"Error updating Hugging Face dataset: {str(e)}" def process_pipeline(folder_id, naming_convention): """Main pipeline for processing images and updating dataset.""" # Validate input is_valid, error_message = validate_input(folder_id, naming_convention) if not is_valid: return error_message, [] manager = DatasetManager() # Step 1: Authenticate Google Drive auth_success, auth_message = manager.authenticate_drive() if not auth_success: return auth_message, [] # Step 2: Download and rename files success, message, renamed_files = manager.download_and_rename_files(folder_id, naming_convention) if not success: return message, [] # Step 3: Update Hugging Face dataset success, hf_message = manager.update_huggingface_dataset(renamed_files) return f"{message}\n{hf_message}", renamed_files def process_ui(folder_id, naming_convention): """UI handler for the process pipeline""" status, renamed_files = process_pipeline(folder_id, naming_convention) table_data = [[file['original_name'], file['new_name'], file['file_path']] for file in renamed_files] if renamed_files else [] return status, table_data # Custom CSS for web-safe fonts and improved styling custom_css = """ div.gradio-container { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif !important; } div.gradio-container button { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif !important; } div.gradio-container input { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif !important; } .gr-form { background-color: #ffffff; border-radius: 8px; padding: 20px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); } .gr-button { background-color: #2c5282; color: white; } .gr-button:hover { background-color: #2b6cb0; } .gr-input { border: 1px solid #e2e8f0; } .gr-input:focus { border-color: #4299e1; box-shadow: 0 0 0 1px #4299e1; } """ # Gradio interface demo = gr.Interface( fn=process_ui, inputs=[ gr.Textbox( label="Google Drive Folder ID", placeholder="Enter the folder ID from the URL", info="Found in your Google Drive folder's URL" ), gr.Textbox( label="Naming Convention", placeholder="e.g., sports_card", value="sports_card", info="Use only letters, numbers, and underscores" ) ], outputs=[ gr.Textbox( label="Status", lines=3 ), gr.Dataframe( headers=["Original Name", "New Name", "File Path"], wrap=True ) ], title="Sports Cards Dataset Processor", description=""" Instructions: 1. Enter the Google Drive folder ID (found in the folder's URL) 2. Specify a naming convention for the files (e.g., 'sports_card') 3. Click submit to start processing Note: Only image files will be processed. Invalid images will be skipped. """, css=custom_css, theme="default" ) if __name__ == "__main__": demo.launch( server_name="0.0.0.0", server_port=7860 )