awacke1 committed on
Commit
2fb4bb5
·
verified ·
1 Parent(s): 93ce1dd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +265 -0
app.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict
2
+ import httpx
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from huggingface_hub import HfApi, ModelCard, snapshot_download, login
6
+ import base64
7
+ import io
8
+ import zipfile
9
+ import asyncio
10
+ import aiohttp
11
+ from pathlib import Path
12
+ import emoji
13
+ import tempfile
14
+ import shutil
15
+ import os
16
+
17
# Authentication helper for the HuggingFace Hub.
def init_huggingface(token: str):
    """Log in to the HuggingFace Hub with an access token.

    Returns True on success; on any failure prints the error and
    returns False instead of raising.
    """
    try:
        login(token=token)
    except Exception as e:  # best-effort: report and signal failure
        print(f"Error logging in: {str(e)}")
        return False
    return True
26
+
27
def search_hub(query: str, search_type: str, token: str = None) -> pd.DataFrame:
    """Search the Hub for Models, Datasets, or Spaces matching *query*.

    Returns a DataFrame of result records, each numbered and given a
    formatted link via format_link (defined elsewhere in this file).
    An unknown search_type yields an empty DataFrame.
    """
    api = HfApi(token=token)
    data: List[Dict] = []

    if search_type == "Models":
        for model in api.list_models(search=query):
            data.append({
                "id": model.modelId,
                "author": model.author,
                "downloads": model.downloads,
                "link": f"https://huggingface.co/{model.modelId}",
            })
    elif search_type == "Datasets":
        for dataset in api.list_datasets(search=query):
            data.append({
                "id": dataset.id,
                "author": dataset.author,
                "downloads": dataset.downloads,
                "link": f"https://huggingface.co/datasets/{dataset.id}",
            })
    elif search_type == "Spaces":
        for space in api.list_spaces(search=query):
            data.append({
                "id": space.id,
                "author": space.author,
                "link": f"https://huggingface.co/spaces/{space.id}",
            })

    # Number results from 1 and attach display links.
    for i, item in enumerate(data, 1):
        item['number'] = i
        item['formatted_link'] = format_link(item, i, search_type)

    return pd.DataFrame(data)
46
+
47
async def download_readme(session: aiohttp.ClientSession, item: Dict, token: str) -> tuple[str, str]:
    """Fetch the README.md of one hub item.

    Returns (safe_filename, content) where safe_filename is the item id
    with '/' replaced by '_'. On a non-200 response or any exception the
    content is a small markdown error report instead of raising.
    """
    item_id = item['id']
    safe_name = item_id.replace('/', '_')
    raw_url = f"https://huggingface.co/{item_id}/raw/main/README.md"
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    try:
        async with session.get(raw_url, headers=headers) as response:
            if response.status == 200:
                return safe_name, await response.text()
            return safe_name, f"# Error downloading README for {item_id}\nStatus code: {response.status}"
    except Exception as e:
        return safe_name, f"# Error downloading README for {item_id}\nError: {str(e)}"
61
+
62
async def download_all_readmes(data: List[Dict], token: str) -> tuple[str, str]:
    """Download every result's README concurrently and zip them.

    Returns (download_link_html, status_message). The link is an inline
    data: URL carrying the base64-encoded zip, so no file is written to
    disk. Returns ("", message) when there is nothing to download.
    """
    if not data:
        return "", "No results to download"

    zip_buffer = io.BytesIO()

    # Fetch all READMEs in parallel; each task yields (safe_name, content).
    async with aiohttp.ClientSession() as session:
        tasks = [download_readme(session, item, token) for item in data]
        results = await asyncio.gather(*tasks)

    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for filename, content in results:
            # BUG FIX: the original wrote every entry to the literal name
            # "(unknown).md", discarding the per-item filename; use it here.
            zip_file.writestr(f"{filename}.md", content)

    zip_buffer.seek(0)
    base64_zip = base64.b64encode(zip_buffer.getvalue()).decode()

    download_link = f"""
    <div style="margin-top: 10px;">
        <a href="data:application/zip;base64,{base64_zip}"
           download="readmes.zip"
           style="display: inline-block; padding: 10px 20px;
                  background-color: #4CAF50; color: white;
                  text-decoration: none; border-radius: 5px;">
            📥 Download READMEs Archive
        </a>
    </div>
    """

    return download_link, "READMEs ready for download!"
94
+
95
def download_repository(repo_id: str, repo_type: str, temp_dir: str, token: str) -> str:
    """Snapshot one repository into *temp_dir*, skipping large weight files.

    Returns the local path of the snapshot, or None if the download failed
    (the error is printed rather than raised).
    """
    # "Models"/"Datasets"/"Spaces" -> "model"/"dataset"/"space".
    kind = repo_type.lower()[:-1]
    target = os.path.join(temp_dir, repo_id.replace('/', '_'))
    try:
        return snapshot_download(
            repo_id=repo_id,
            repo_type=kind,
            local_dir=target,
            # Skip large binary weight files to keep the archive small.
            ignore_patterns=["*.bin", "*.pt", "*.pth", "*.ckpt", "*.safetensors"],
            token=token
        )
    except Exception as e:
        print(f"Error downloading {repo_id}: {str(e)}")
        return None
109
+
110
def create_repo_zip(data: List[Dict], search_type: str, token: str) -> tuple[str, str]:
    """Snapshot every result repository and bundle them into one zip.

    Returns (download_link_html, status_message); the link is an inline
    base64 data: URL. Returns ("", message) when nothing could be
    downloaded. Snapshots live in a temporary directory that is removed
    before this function returns.
    """
    if not data:
        return "", "No repositories to download"

    with tempfile.TemporaryDirectory() as temp_dir:
        # Download sequentially; failed repos return None and are dropped.
        successful_downloads = [
            path
            for path in (
                download_repository(item['id'], search_type, temp_dir, token)
                for item in data
            )
            if path
        ]

        if not successful_downloads:
            return "", "No repositories were successfully downloaded"

        # Build the archive in memory, one top-level folder per repo.
        zip_buffer = io.BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for repo_path in successful_downloads:
                repo_name = os.path.basename(repo_path)
                for root, _, files in os.walk(repo_path):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.join(repo_name, os.path.relpath(file_path, repo_path))
                        zip_file.write(file_path, arcname)

        zip_buffer.seek(0)
        base64_zip = base64.b64encode(zip_buffer.getvalue()).decode()

        download_link = f"""
        <div style="margin-top: 10px;">
            <a href="data:application/zip;base64,{base64_zip}"
               download="repositories.zip"
               style="display: inline-block; padding: 10px 20px;
                      background-color: #4CAF50; color: white;
                      text-decoration: none; border-radius: 5px;">
                📥 Download Repositories Archive
            </a>
        </div>
        """

        return download_link, f"Successfully downloaded {len(successful_downloads)} repositories"
156
+
157
+ # NOTE: format_link, display_results, and SwarmyTime are referenced below but
+ # are NOT defined in this file — they must be provided elsewhere or the app
+ # will raise NameError at runtime.
158
+
159
# Gradio UI: one search bar with a type selector, two download buttons, and
# panes for HTML results plus aggregated JSON statistics. Handlers below pass
# state between the search step and the download steps via gr.State.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Search the Hugging Face Hub
    Search and download models, datasets, and spaces from Hugging Face.
    """)

    # Optional token row: used for private/gated repos and rate limits.
    with gr.Row():
        with gr.Column(scale=3):
            hf_token = gr.Textbox(
                label="HuggingFace Access Token (optional)",
                type="password",
                placeholder="Enter your HuggingFace access token...",
            )

    # Search controls: query text, result-type radio, and the search button.
    with gr.Row():
        with gr.Column(scale=3):
            search_query = gr.Textbox(
                label="Search Query",
                value="awacke1",
                placeholder="Enter search term..."
            )
        with gr.Column(scale=2):
            search_type = gr.Radio(
                ["Models", "Datasets", "Spaces"],
                label="Search Type",
                value="Models",
                container=True
            )
        with gr.Column(scale=1):
            search_button = gr.Button("🔍 Search", variant="primary", scale=1)

    # Download panel: README zip, full-repository zip, status, and the
    # generated data:-URL link rendered as raw HTML.
    with gr.Row(variant="panel"):
        with gr.Column(scale=1):
            gr.Markdown("### Download Options")
            with gr.Row():
                download_readme_button = gr.Button(
                    "📚 Download READMEs",
                    variant="secondary",
                )
                download_repo_button = gr.Button(
                    "📦 Download Repositories",
                    variant="secondary",
                )
            download_status = gr.Markdown("Status: Ready to download", label="Status")
            download_area = gr.HTML("", label="Download Link")

    # Results panes.
    with gr.Row():
        with gr.Column(scale=2):
            results_html = gr.HTML(label="Search Results")
        with gr.Column(scale=1):
            aggregated_output = gr.JSON(label="Search Statistics")

    # Hidden state carried from the search handler to the download handlers:
    # the active search type and the raw list of result records.
    search_type_state = gr.State("")
    current_results = gr.State([])

    def search_and_aggregate(query, search_type, token):
        """Run a hub search, aggregate stats, and reset the download panes.

        NOTE(review): SwarmyTime and display_results are not defined in this
        file — presumably an aggregator and an HTML renderer; confirm they
        are provided before running.
        """
        df = search_hub(query, search_type, token)
        data = df.to_dict('records')
        aggregated = SwarmyTime(data)
        html_results = display_results(df)
        return [
            html_results,  # results_html
            "Status: Ready to download",  # download_status
            "",  # download_area
            aggregated,  # aggregated_output
            search_type,  # search_type_state
            data  # current_results
        ]

    async def handle_readme_download(data, token):
        """Async click handler: zip all READMEs for the cached results."""
        if not data:
            return ["Status: No results to download", ""]
        download_link, status = await download_all_readmes(data, token)
        return [f"Status: {status}", download_link]

    def handle_repo_download(data, search_type, token):
        """Click handler: snapshot and zip the cached result repositories."""
        if not data:
            return ["Status: No results to download", ""]
        download_link, status = create_repo_zip(data, search_type, token)
        return [f"Status: {status}", download_link]

    # Wire the search button: outputs must match search_and_aggregate's
    # return list order exactly.
    search_button.click(
        search_and_aggregate,
        inputs=[search_query, search_type, hf_token],
        outputs=[
            results_html,
            download_status,
            download_area,
            aggregated_output,
            search_type_state,
            current_results
        ]
    )

    download_readme_button.click(
        handle_readme_download,
        inputs=[current_results, hf_token],
        outputs=[download_status, download_area]
    )

    download_repo_button.click(
        handle_repo_download,
        inputs=[current_results, search_type_state, hf_token],
        outputs=[download_status, download_area]
    )

# Launch with debug=True so tracebacks surface in the console.
demo.launch(debug=True)