GotThatData committed
Commit 23f92f3 · verified · 1 Parent(s): 71ac033
Files changed (1):
  1. app.py +135 -3
app.py CHANGED
@@ -23,13 +23,145 @@ if not os.path.exists("settings.yaml"):
  with open('settings.yaml', 'r') as file:
      settings = yaml.safe_load(file)

- [... keep all the utility functions and DatasetManager class the same ...]
+ # Utility Functions
+ def safe_load_dataset(dataset_name):
+     """Load Hugging Face dataset safely."""
+     try:
+         dataset = load_dataset(dataset_name)
+         return dataset, len(dataset['train']) if 'train' in dataset else 0
+     except Exception as e:
+         logger.info(f"No existing dataset found. Starting fresh. Error: {str(e)}")
+         return None, 0
+
+ def is_valid_image(file_path):
+     """Check if a file is a valid image."""
+     try:
+         with Image.open(file_path) as img:
+             img.verify()
+         return True
+     except Exception as e:
+         logger.error(f"Invalid image: {file_path}. Error: {str(e)}")
+         return False
+
+ def validate_input(folder_id, naming_convention):
+     """Validate user input."""
+     if not folder_id or not folder_id.strip():
+         return False, "Folder ID cannot be empty"
+     if not naming_convention or not naming_convention.strip():
+         return False, "Naming convention cannot be empty"
+     if not naming_convention.replace('_', '').isalnum():
+         return False, "Naming convention should only contain letters, numbers, and underscores"
+     return True, ""
+
+ # DatasetManager Class
+ class DatasetManager:
+     def __init__(self, local_images_dir="downloaded_cards"):
+         self.local_images_dir = local_images_dir
+         self.drive = None
+         self.dataset_name = "GotThatData/sports-cards"
+         os.makedirs(local_images_dir, exist_ok=True)
+
+     def authenticate_drive(self):
+         """Authenticate with Google Drive."""
+         try:
+             gauth = GoogleAuth()
+             gauth.settings['client_config_file'] = settings['client_secrets_file']
+
+             # Try to load saved credentials
+             gauth.LoadCredentialsFile("credentials.txt")
+             if gauth.credentials is None:
+                 gauth.LocalWebserverAuth()
+             elif gauth.access_token_expired:
+                 gauth.Refresh()
+             else:
+                 gauth.Authorize()
+             gauth.SaveCredentialsFile("credentials.txt")
+
+             self.drive = GoogleDrive(gauth)
+             return True, "Successfully authenticated with Google Drive"
+         except Exception as e:
+             logger.error(f"Authentication failed: {str(e)}")
+             return False, f"Authentication failed: {str(e)}"
+
+     def download_and_rename_files(self, drive_folder_id, naming_convention):
+         """Download files from Google Drive and rename them."""
+         if not self.drive:
+             return False, "Google Drive not authenticated", []
+
+         try:
+             query = f"'{drive_folder_id}' in parents and trashed=false"
+             file_list = self.drive.ListFile({'q': query}).GetList()
+
+             if not file_list:
+                 logger.warning(f"No files found in folder: {drive_folder_id}")
+                 return False, "No files found in the specified folder.", []
+
+             existing_dataset, start_index = safe_load_dataset(self.dataset_name)
+             renamed_files = []
+             processed_count = 0
+             error_count = 0
+
+             for i, file in enumerate(tqdm(file_list, desc="Downloading files", unit="file")):
+                 if 'mimeType' in file and 'image' in file['mimeType'].lower():
+                     new_filename = f"{naming_convention}_{start_index + processed_count + 1}.jpg"
+                     file_path = os.path.join(self.local_images_dir, new_filename)
+
+                     try:
+                         file.GetContentFile(file_path)
+                         if is_valid_image(file_path):
+                             renamed_files.append({
+                                 'file_path': file_path,
+                                 'original_name': file['title'],
+                                 'new_name': new_filename
+                             })
+                             processed_count += 1
+                             logger.info(f"Successfully processed: {file['title']} -> {new_filename}")
+                         else:
+                             error_count += 1
+                             if os.path.exists(file_path):
+                                 os.remove(file_path)
+                     except Exception as e:
+                         error_count += 1
+                         logger.error(f"Error processing file {file['title']}: {str(e)}")
+                         if os.path.exists(file_path):
+                             os.remove(file_path)
+
+             status_message = f"Processed {processed_count} images successfully"
+             if error_count > 0:
+                 status_message += f" ({error_count} files failed)"
+
+             return True, status_message, renamed_files
+         except Exception as e:
+             logger.error(f"Download error: {str(e)}")
+             return False, f"Error during download: {str(e)}", []
+
+     def update_huggingface_dataset(self, renamed_files):
+         """Update Hugging Face dataset with new images."""
+         if not renamed_files:
+             return False, "No files to update"
+
+         try:
+             df = pd.DataFrame(renamed_files)
+             new_dataset = Dataset.from_pandas(df)
+
+             existing_dataset, _ = safe_load_dataset(self.dataset_name)
+             if existing_dataset and 'train' in existing_dataset:
+                 combined_dataset = concatenate_datasets([existing_dataset['train'], new_dataset])
+             else:
+                 combined_dataset = new_dataset
+
+             combined_dataset.push_to_hub(self.dataset_name, split="train")
+             return True, f"Successfully updated dataset '{self.dataset_name}' with {len(renamed_files)} new images."
+         except Exception as e:
+             logger.error(f"Dataset update error: {str(e)}")
+             return False, f"Error updating Hugging Face dataset: {str(e)}"
 
  def process_pipeline(folder_id, naming_convention):
      """Main pipeline for processing images and updating dataset."""
      # Validate input
-     if not folder_id or not naming_convention:
-         return "Please provide both folder ID and naming convention", []
+     is_valid, error_message = validate_input(folder_id, naming_convention)
+     if not is_valid:
+         return error_message, []

      manager = DatasetManager()
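
For reference, a brief hypothetical usage sketch of the helpers this commit adds (not part of the diff). It assumes app.py can be imported as a module named app; "FOLDER_ID" and the "sports_cards" naming convention are placeholder values.

# Hypothetical usage sketch -- not part of the commit. Assumes app.py is
# importable as `app`; "FOLDER_ID" is a placeholder Google Drive folder ID.
from app import DatasetManager, validate_input

# Input validation runs before any Drive or Hub calls are made.
print(validate_input("", "sports_cards"))           # (False, "Folder ID cannot be empty")
print(validate_input("FOLDER_ID", "cards 2024"))    # (False, "Naming convention should only contain letters, numbers, and underscores")
print(validate_input("FOLDER_ID", "sports_cards"))  # (True, "")

# A plausible end-to-end flow using the DatasetManager methods added above:
# authenticate, download and rename, then push the new records to the Hub.
manager = DatasetManager()
ok, msg = manager.authenticate_drive()
if ok:
    ok, msg, renamed = manager.download_and_rename_files("FOLDER_ID", "sports_cards")
    print(msg)
    if ok and renamed:
        ok, msg = manager.update_huggingface_dataset(renamed)
        print(msg)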