GotThatData committed
Commit 23f92f3 · verified · 1 Parent(s): 71ac033
Files changed (1):
  1. app.py +135 -3
app.py CHANGED
@@ -23,13 +23,145 @@ if not os.path.exists("settings.yaml"):
  with open('settings.yaml', 'r') as file:
      settings = yaml.safe_load(file)

- [... keep all the utility functions and DatasetManager class the same ...]
+ # Utility Functions
+ def safe_load_dataset(dataset_name):
+     """Load Hugging Face dataset safely."""
+     try:
+         dataset = load_dataset(dataset_name)
+         return dataset, len(dataset['train']) if 'train' in dataset else 0
+     except Exception as e:
+         logger.info(f"No existing dataset found. Starting fresh. Error: {str(e)}")
+         return None, 0
+
+ def is_valid_image(file_path):
+     """Check if a file is a valid image."""
+     try:
+         with Image.open(file_path) as img:
+             img.verify()
+         return True
+     except Exception as e:
+         logger.error(f"Invalid image: {file_path}. Error: {str(e)}")
+         return False
+
+ def validate_input(folder_id, naming_convention):
+     """Validate user input."""
+     if not folder_id or not folder_id.strip():
+         return False, "Folder ID cannot be empty"
+     if not naming_convention or not naming_convention.strip():
+         return False, "Naming convention cannot be empty"
+     if not naming_convention.replace('_', '').isalnum():
+         return False, "Naming convention should only contain letters, numbers, and underscores"
+     return True, ""
+
+ # DatasetManager Class
+ class DatasetManager:
+     def __init__(self, local_images_dir="downloaded_cards"):
+         self.local_images_dir = local_images_dir
+         self.drive = None
+         self.dataset_name = "GotThatData/sports-cards"
+         os.makedirs(local_images_dir, exist_ok=True)
+
+     def authenticate_drive(self):
+         """Authenticate with Google Drive."""
+         try:
+             gauth = GoogleAuth()
+             gauth.settings['client_config_file'] = settings['client_secrets_file']
+
+             # Try to load saved credentials
+             gauth.LoadCredentialsFile("credentials.txt")
+             if gauth.credentials is None:
+                 gauth.LocalWebserverAuth()
+             elif gauth.access_token_expired:
+                 gauth.Refresh()
+             else:
+                 gauth.Authorize()
+             gauth.SaveCredentialsFile("credentials.txt")
+
+             self.drive = GoogleDrive(gauth)
+             return True, "Successfully authenticated with Google Drive"
+         except Exception as e:
+             logger.error(f"Authentication failed: {str(e)}")
+             return False, f"Authentication failed: {str(e)}"
+
+     def download_and_rename_files(self, drive_folder_id, naming_convention):
+         """Download files from Google Drive and rename them."""
+         if not self.drive:
+             return False, "Google Drive not authenticated", []
+
+         try:
+             query = f"'{drive_folder_id}' in parents and trashed=false"
+             file_list = self.drive.ListFile({'q': query}).GetList()
+
+             if not file_list:
+                 logger.warning(f"No files found in folder: {drive_folder_id}")
+                 return False, "No files found in the specified folder.", []
+
+             existing_dataset, start_index = safe_load_dataset(self.dataset_name)
+             renamed_files = []
+             processed_count = 0
+             error_count = 0
+
+             for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
+                 if 'mimeType' in file and 'image' in file['mimeType'].lower():
+                     new_filename = f"{naming_convention}_{start_index + processed_count + 1}.jpg"
+                     file_path = os.path.join(self.local_images_dir, new_filename)
+
+                     try:
+                         file.GetContentFile(file_path)
+                         if is_valid_image(file_path):
+                             renamed_files.append({
+                                 'file_path': file_path,
+                                 'original_name': file['title'],
+                                 'new_name': new_filename
+                             })
+                             processed_count += 1
+                             logger.info(f"Successfully processed: {file['title']} -> {new_filename}")
+                         else:
+                             error_count += 1
+                             if os.path.exists(file_path):
+                                 os.remove(file_path)
+                     except Exception as e:
+                         error_count += 1
+                         logger.error(f"Error processing file {file['title']}: {str(e)}")
+                         if os.path.exists(file_path):
+                             os.remove(file_path)
+
+             status_message = f"Processed {processed_count} images successfully"
+             if error_count > 0:
+                 status_message += f" ({error_count} files failed)"
+
+             return True, status_message, renamed_files
+         except Exception as e:
+             logger.error(f"Download error: {str(e)}")
+             return False, f"Error during download: {str(e)}", []
+
+     def update_huggingface_dataset(self, renamed_files):
+         """Update Hugging Face dataset with new images."""
+         if not renamed_files:
+             return False, "No files to update"
+
+         try:
+             df = pd.DataFrame(renamed_files)
+             new_dataset = Dataset.from_pandas(df)
+
+             existing_dataset, _ = safe_load_dataset(self.dataset_name)
+             if existing_dataset and 'train' in existing_dataset:
+                 combined_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
+             else:
+                 combined_dataset = new_dataset
+
+             combined_dataset.push_to_hub(self.dataset_name, split="train")
+             return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
+         except Exception as e:
+             logger.error(f"Dataset update error: {str(e)}")
+             return False, f"Error updating Hugging Face dataset: {str(e)}"
 
  def process_pipeline(folder_id, naming_convention):
      """Main pipeline for processing images and updating dataset."""
      # Validate input
-     if not folder_id or not naming_convention:
-         return "Please provide both folder ID and naming convention", []
+     is_valid, error_message = validate_input(folder_id, naming_convention)
+     if not is_valid:
+         return error_message, []

      manager = DatasetManager()
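
For reference, a brief hypothetical usage sketch of the helpers this commit adds (not part of the diff). It assumes app.py can be imported as a module named app; "FOLDER_ID" and the "sports_cards" naming convention are placeholder values.

# Hypothetical usage sketch -- not part of the commit. Assumes app.py is
# importable as `app`; "FOLDER_ID" is a placeholder Google Drive folder ID.
from app import DatasetManager, validate_input

# Input validation runs before any Drive or Hub calls are made.
print(validate_input("", "sports_cards"))           # (False, "Folder ID cannot be empty")
print(validate_input("FOLDER_ID", "cards 2024"))    # (False, "Naming convention should only contain letters, numbers, and underscores")
print(validate_input("FOLDER_ID", "sports_cards"))  # (True, "")

# A plausible end-to-end flow using the DatasetManager methods added above:
# authenticate, download and rename, then push the new records to the Hub.
manager = DatasetManager()
ok, msg = manager.authenticate_drive()
if ok:
    ok, msg, renamed = manager.download_and_rename_files("FOLDER_ID", "sports_cards")
    print(msg)
    if ok and renamed:
        ok, msg = manager.update_huggingface_dataset(renamed)
        print(msg)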