awacke1 commited on
Commit
4bf9d79
·
verified ·
1 Parent(s): a4c2226

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +417 -0
app.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ import httpx
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from huggingface_hub import HfApi, ModelCard, snapshot_download, login
6
+ import base64
7
+ import io
8
+ import zipfile
9
+ import asyncio
10
+ import aiohttp
11
+ from pathlib import Path
12
+ import emoji
13
+ import tempfile
14
+ import shutil
15
+ import os
16
+
17
# Example search terms offered as one-click presets in the UI.
# Each entry pairs a Hub search query ("id") with a display emoji.
example_search_terms = [
    {"id": term_id, "emoji": icon}
    for term_id, icon in [
        ("gpt-3", "🤖"),
        ("stable-diffusion", "🎨"),
        ("whisper", "🗣️"),
        ("bert", "📖"),
        ("resnet", "🖼️"),
    ]
]
25
+
26
+
27
# Initialize HuggingFace with access token
def init_huggingface(token: str) -> bool:
    """Log in to the Hugging Face Hub with *token*; return True on success."""
    try:
        login(token=token)
    except Exception as e:
        # Report the failure but do not raise: callers only need a bool.
        print(f"Error logging in: {str(e)}")
        return False
    return True
36
+
37
def format_link(item: Dict, number: int, search_type: str) -> str:
    """Render one search result as an HTML snippet for the results pane.

    Produces a numbered title, links to the repo and its README, and an
    author/downloads metadata line (downloads shown only when present).
    """
    repo_url = item['link']
    readme_url = f"{repo_url}/blob/main/README.md"
    heading = f"{number}. {item['id']}"

    details = f"Author: {item['author']}"
    if 'downloads' in item:
        details = f"{details}, Downloads: {item['downloads']}"

    # search_type is plural ("Models"/"Datasets"/"Spaces"); drop the
    # trailing "s" for the per-item link label.
    return f"""
    <div style="margin-bottom: 10px;">
        <strong>{heading}</strong><br>
        <a href="{repo_url}" target="_blank" style="color: #4a90e2; text-decoration: none;">View {search_type[:-1]}</a> |
        <a href="{readme_url}" target="_blank" style="color: #4a90e2; text-decoration: none;">View README</a><br>
        <small>{details}</small>
    </div>
    """
56
+
57
def display_results(df: pd.DataFrame) -> str:
    """Join the pre-rendered per-row HTML snippets into a scrollable panel.

    Returns a "no results" paragraph when *df* is None or empty.
    """
    if df is None or df.empty:
        return "<p>No results found.</p>"

    # Each row already carries its HTML (built by format_link in search_hub).
    pieces = ["<div style='max-height: 400px; overflow-y: auto;'>"]
    pieces.extend(row['formatted_link'] for _, row in df.iterrows())
    pieces.append("</div>")
    return "".join(pieces)
67
+
68
def SwarmyTime(data: List[Dict]) -> Dict:
    """Aggregate summary statistics over a list of search-result records.

    Counts items, distinct authors, total downloads, and a per-kind tally.
    Kind detection is heuristic: records with a "modelId" key are Models,
    records whose id contains "dataset" are Datasets, everything else Spaces.
    """
    authors = set()
    downloads = 0
    type_counts = {"Models": 0, "Datasets": 0, "Spaces": 0}

    for entry in data:
        authors.add(entry.get("author", "Unknown"))
        downloads += entry.get("downloads", 0)

        if "modelId" in entry:
            type_counts["Models"] += 1
        elif "dataset" in entry.get("id", ""):
            type_counts["Datasets"] += 1
        else:
            type_counts["Spaces"] += 1

    return {
        "total_items": len(data),
        "unique_authors": len(authors),
        "total_downloads": downloads,
        "item_types": type_counts,
    }
90
+
91
def search_and_aggregate(query, search_type, token, example_term):
    """Run a Hub search and build the outputs for the search click handler.

    If an example button label ("<emoji> <id>") is selected, its id part
    overrides *query*.  Returns the six values the Gradio click wiring
    expects: results HTML, status text, download area, statistics, the
    search type, and the raw result records.
    """
    if example_term:
        # Labels look like "🤖 gpt-3".  Split at most once so ids that
        # contain spaces survive intact, and guard against labels with no
        # space at all (the old `split(" ")[1]` raised IndexError there).
        parts = example_term.split(" ", 1)
        if len(parts) == 2:
            query = parts[1]
    df = search_hub(query, search_type, token)
    data = df.to_dict('records')
    aggregated = SwarmyTime(data)
    html_results = display_results(df)
    return [
        html_results,
        "Status: Ready to download",
        "",
        aggregated,
        search_type,
        data,
    ]
106
+
107
+
108
+
109
def search_hub(query: str, search_type: str, token: str = None) -> pd.DataFrame:
    """Search the Hugging Face Hub for models, datasets, or spaces."""
    api = HfApi(token=token)

    if search_type == "Models":
        data = [
            {
                "id": m.modelId,
                "author": m.author,
                "downloads": m.downloads,
                "link": f"https://huggingface.co/{m.modelId}",
            }
            for m in api.list_models(search=query)
        ]
    elif search_type == "Datasets":
        data = [
            {
                "id": d.id,
                "author": d.author,
                "downloads": d.downloads,
                "link": f"https://huggingface.co/datasets/{d.id}",
            }
            for d in api.list_datasets(search=query)
        ]
    elif search_type == "Spaces":
        # NOTE: space records carry no download count, so format_link and
        # SwarmyTime treat "downloads" as optional.
        data = [
            {
                "id": s.id,
                "author": s.author,
                "link": f"https://huggingface.co/spaces/{s.id}",
            }
            for s in api.list_spaces(search=query)
        ]
    else:
        data = []

    # Attach 1-based numbering and pre-rendered HTML for the results pane.
    for position, record in enumerate(data, 1):
        record['number'] = position
        record['formatted_link'] = format_link(record, position, search_type)

    return pd.DataFrame(data)
129
+
130
async def download_readme(session: aiohttp.ClientSession, item: Dict, token: str) -> tuple[str, str]:
    """Fetch one repo's README.md; return (safe_filename, content).

    Tries the `main` branch first, falls back to `master` on 401/404.
    On failure the content is an error report rather than an exception,
    so gather() callers never see a raised error.
    """
    item_id = item['id']
    safe_name = item_id.replace('/', '_')

    # Repo type determines the raw-file URL prefix.
    if 'datasets' in item['link']:
        prefix = f"https://huggingface.co/datasets/{item_id}"
    elif 'spaces' in item['link']:
        prefix = f"https://huggingface.co/spaces/{item_id}"
    else:  # Models
        prefix = f"https://huggingface.co/{item_id}"
    raw_url = f"{prefix}/raw/main/README.md"
    alt_url = f"{prefix}/raw/master/README.md"

    headers = {"Authorization": f"Bearer {token}"} if token else {}

    try:
        # Try the main branch first.
        async with session.get(raw_url, headers=headers) as response:
            if response.status == 200:
                return safe_name, await response.text()

            # Fall back to the master branch on auth / not-found errors.
            if response.status in [401, 404]:
                async with session.get(alt_url, headers=headers) as alt_response:
                    if alt_response.status == 200:
                        return safe_name, await alt_response.text()

            # Both attempts failed: report the main-branch status.
            error_msg = f"# Error downloading README for {item_id}\n"
            if response.status == 401:
                error_msg += "Authentication required. Please provide a valid HuggingFace token."
            else:
                error_msg += f"Status code: {response.status}"
            return safe_name, error_msg

    except Exception as e:
        return safe_name, f"# Error downloading README for {item_id}\nError: {str(e)}"
171
+
172
async def download_all_readmes(data: List[Dict], token: str) -> tuple[str, str]:
    """Download all READMEs concurrently and pack them into a zip archive.

    Returns (download_link_html, status_message).  The archive is embedded
    in the link as a base64 data URI, so no server-side file is written.
    """
    if not data:
        return "", "No results to download"

    zip_buffer = io.BytesIO()
    failed_downloads = []

    async with aiohttp.ClientSession() as session:
        tasks = [download_readme(session, item, token) for item in data]
        results = await asyncio.gather(*tasks)

    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for filename, content in results:
            if "Error downloading README" in content:
                failed_downloads.append(filename)
            # BUG FIX: name each entry after its repository.  The previous
            # code wrote every README to one literal archive name, so each
            # entry overwrote the last and the zip held a single file.
            zip_file.writestr(f"{filename}.md", content)

    zip_buffer.seek(0)
    base64_zip = base64.b64encode(zip_buffer.getvalue()).decode()

    status = "READMEs ready for download!"
    if failed_downloads:
        status += f" (Failed to download {len(failed_downloads)} READMEs)"

    download_link = f"""
    <div style="margin-top: 10px;">
        <a href="data:application/zip;base64,{base64_zip}"
           download="readmes.zip"
           style="display: inline-block; padding: 10px 20px;
                  background-color: #4CAF50; color: white;
                  text-decoration: none; border-radius: 5px;">
            📥 Download READMEs Archive
        </a>
        {f'<p style="color: #ff6b6b; margin-top: 10px;">Note: Some READMEs could not be downloaded. Please check the zip file for details.</p>' if failed_downloads else ''}
    </div>
    """

    return download_link, status
212
+
213
def download_repository(repo_id: str, repo_type: str, temp_dir: str, token: str) -> str:
    """Snapshot one repository into *temp_dir*; return its local path.

    Returns None (after printing the error) when the download fails, so
    callers can simply filter out unsuccessful repos.
    """
    # The UI uses plural labels ("Models"); the hub API wants the singular.
    hub_type = repo_type.lower()[:-1]
    target = os.path.join(temp_dir, repo_id.replace('/', '_'))
    try:
        return snapshot_download(
            repo_id=repo_id,
            repo_type=hub_type,
            local_dir=target,
            # Skip large binary weight/checkpoint files.
            ignore_patterns=["*.bin", "*.pt", "*.pth", "*.ckpt", "*.safetensors"],
            token=token,
        )
    except Exception as e:
        print(f"Error downloading {repo_id}: {str(e)}")
        return None
227
+
228
def create_repo_zip(data: List[Dict], search_type: str, token: str) -> tuple[str, str]:
    """Download every result's repository and bundle them into one zip.

    Returns (download_link_html, status_message); the archive travels as a
    base64 data URI inside the link.
    """
    if not data:
        return "", "No repositories to download"

    # All downloads land in a temp dir that is removed when we leave the block.
    with tempfile.TemporaryDirectory() as temp_dir:
        successful_downloads = [
            path
            for path in (
                download_repository(item['id'], search_type, temp_dir, token)
                for item in data
            )
            if path
        ]

        if not successful_downloads:
            return "", "No repositories were successfully downloaded"

        # Build the archive in memory, rooting each repo under its own folder.
        zip_buffer = io.BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for repo_path in successful_downloads:
                repo_name = os.path.basename(repo_path)
                for root, _, files in os.walk(repo_path):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.join(repo_name, os.path.relpath(file_path, repo_path))
                        zip_file.write(file_path, arcname)

        zip_buffer.seek(0)
        base64_zip = base64.b64encode(zip_buffer.getvalue()).decode()

        download_link = f"""
        <div style="margin-top: 10px;">
            <a href="data:application/zip;base64,{base64_zip}"
               download="repositories.zip"
               style="display: inline-block; padding: 10px 20px;
                      background-color: #4CAF50; color: white;
                      text-decoration: none; border-radius: 5px;">
                📥 Download Repositories Archive
            </a>
        </div>
        """

        return download_link, f"Successfully downloaded {len(successful_downloads)} repositories"
274
+
275
# Gradio Interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Search the Hugging Face Hub
    Search and download models, datasets, and spaces from Hugging Face.
    """)

    with gr.Row():
        # BUG FIX: gr.ButtonGroup does not exist in Gradio.  A Radio with
        # the same "<emoji> <id>" labels provides the one-of-many example
        # picker the original intended.
        example_search_buttons = gr.Radio(
            choices=[f"{term['emoji']} {term['id']}" for term in example_search_terms],
            label="Example Search Terms",
            value=None,
        )

    with gr.Row():
        with gr.Column(scale=3):
            hf_token = gr.Textbox(
                label="HuggingFace Access Token (optional)",
                type="password",
                placeholder="Enter your HuggingFace access token...",
            )

    with gr.Row():
        with gr.Column(scale=3):
            search_query = gr.Textbox(
                label="Search Query",
                value="awacke1",
                placeholder="Enter search term..."
            )
        with gr.Column(scale=2):
            search_type = gr.Radio(
                ["Models", "Datasets", "Spaces"],
                label="Search Type",
                value="Models",
                container=True
            )
        with gr.Column(scale=1):
            search_button = gr.Button("🔍 Search", variant="primary", scale=1)

    with gr.Row(variant="panel"):
        with gr.Column(scale=1):
            gr.Markdown("### Download Options")
            with gr.Row():
                download_readme_button = gr.Button(
                    "📚 Download READMEs",
                    variant="secondary",
                )
                download_repo_button = gr.Button(
                    "📦 Download Repositories",
                    variant="secondary",
                )
            download_status = gr.Markdown("Status: Ready to download", label="Status")
            download_area = gr.HTML("", label="Download Link")

    with gr.Row():
        with gr.Column(scale=2):
            results_html = gr.HTML(label="Search Results")
        with gr.Column(scale=1):
            aggregated_output = gr.JSON(label="Search Statistics")

    # States carry the last search's type and raw records to the download handlers.
    search_type_state = gr.State("")
    current_results = gr.State([])

    async def handle_readme_download(data, token):
        """Zip the READMEs of the current results and surface a download link."""
        if not data:
            return ["Status: No results to download", ""]
        download_link, status = await download_all_readmes(data, token)
        return [f"Status: {status}", download_link]

    def handle_repo_download(data, search_type, token):
        """Zip the full repositories of the current results and surface a link."""
        if not data:
            return ["Status: No results to download", ""]
        download_link, status = create_repo_zip(data, search_type, token)
        return [f"Status: {status}", download_link]

    # BUG FIX: the click previously targeted a local three-parameter
    # handler while passing four inputs (TypeError at event time) and used
    # `call_after`, which is not a Button.click keyword.  The module-level
    # search_and_aggregate already accepts the example-term input, so wire
    # it directly; downloads stay on their own buttons.
    search_button.click(
        search_and_aggregate,
        inputs=[search_query, search_type, hf_token, example_search_buttons],
        outputs=[
            results_html,
            download_status,
            download_area,
            aggregated_output,
            search_type_state,
            current_results,
        ],
    )

    download_readme_button.click(
        handle_readme_download,
        inputs=[current_results, hf_token],
        outputs=[download_status, download_area],
    )

    download_repo_button.click(
        handle_repo_download,
        inputs=[current_results, search_type_state, hf_token],
        outputs=[download_status, download_area],
    )

demo.launch(debug=True)