Spaces:
Paused
Paused
throaway2854
committed on
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
import gradio as gr
|
6 |
+
from datasets import Dataset
|
7 |
+
from PIL import Image
|
8 |
+
import io
|
9 |
+
import uuid
|
10 |
+
import shutil
|
11 |
+
|
12 |
+
# Root directory for everything the app writes to disk.
DATA_DIR = "/data"

# Directory that receives the downloaded image files.
IMAGES_DIR = os.path.join(DATA_DIR, "images")

# JSON file that stores the list of dataset entries.
DATASET_FILE = os.path.join(DATA_DIR, "dataset.json")
|
15 |
+
|
16 |
+
def extract_image_url(html_content):
    """Return the image URL found in a post page, or ``None``.

    Two strategies are tried in order:

    1. An inline ``<script type="text/javascript">`` whose text contains an
       ``image = {...}`` assignment. The object is parsed as JSON and the URL
       is assembled from its ``domain``, ``base_dir``, ``dir`` and ``img``
       keys.
    2. The ``src`` attribute of the first ``<img>`` tag that has an ``alt``
       attribute.

    Args:
        html_content: HTML markup of the page, as text.

    Returns:
        The image URL as a string, or ``None`` when neither strategy yields
        one.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    marker = 'image ='
    script = soup.find(
        'script',
        type='text/javascript',
        string=lambda text: bool(text) and marker in text,
    )

    if script:
        # Split on the marker itself rather than on the first '=' in the
        # script, so statements preceding the assignment do not end up in the
        # payload.
        payload = script.string.split(marker, 1)[1].lstrip()
        try:
            # raw_decode parses the first JSON value and ignores whatever
            # follows it (a trailing ';' or further statements).
            image_data, _ = json.JSONDecoder().raw_decode(payload)
            return (
                f"{image_data['domain']}{image_data['base_dir']}"
                f"/{image_data['dir']}/{image_data['img']}"
            )
        except (ValueError, KeyError, TypeError):
            # Payload is not valid JSON or lacks an expected key: fall
            # through to the <img> strategy instead of aborting.
            pass

    img_tag = soup.find('img', alt=True)
    if img_tag and 'src' in img_tag.attrs:
        return img_tag['src']

    return None
|
29 |
+
|
30 |
+
def extract_tags(html_content):
    """Collect the general tags listed on a post page.

    Every ``<li class="tag-type-general">`` element is inspected and the text
    of its second ``<a>`` link is taken as the tag name.

    Args:
        html_content: HTML markup of the page, as text.

    Returns:
        The tag names joined with ``','``; an empty string when no tag is
        found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    tags = []
    for tag_element in soup.find_all('li', class_='tag-type-general'):
        links = tag_element.find_all('a')
        # The tag name sits in the second link. Entries with fewer links are
        # skipped; indexing them directly would raise IndexError.
        if len(links) < 2:
            continue
        tags.append(links[1].text)

    return ','.join(tags)
|
41 |
+
|
42 |
+
def download_image(url, timeout=30):
    """Download *url* and open the response body as a PIL image.

    Args:
        url: URL of the image file.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            one the request can block indefinitely on an unresponsive server.

    Returns:
        The image object returned by ``Image.open`` for the downloaded bytes.

    Raises:
        Exception: If the HTTP request fails or returns an error status. The
            underlying ``requests`` exception is chained as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        raise Exception(f"Failed to download image: {str(e)}") from e

    return Image.open(io.BytesIO(response.content))
|
49 |
+
|
50 |
+
class DatasetBuilder:
    """Accumulate scraped images and their tags, persisting them on disk.

    ``self.dataset`` is a list of ``{'image': <filename>, 'tags': <str>}``
    dicts. It is mirrored to ``DATASET_FILE`` as JSON after every addition,
    and the image files themselves are written to ``IMAGES_DIR``.
    """

    # Seconds to wait for the page request before giving up.
    REQUEST_TIMEOUT = 30

    # Pixel modes saved to JPEG unchanged; any other mode (e.g. RGBA or
    # palette images) is converted to RGB first because saving it as JPEG
    # would fail.
    JPEG_MODES = ("L", "RGB", "CMYK")

    def __init__(self):
        """Create the image directory and load any previously saved entries."""
        os.makedirs(IMAGES_DIR, exist_ok=True)
        self.dataset = self.load_dataset()

    def load_dataset(self):
        """Return the entries stored in ``DATASET_FILE``.

        Returns:
            The parsed JSON content, or an empty list when the file does not
            exist.
        """
        if not os.path.exists(DATASET_FILE):
            return []
        with open(DATASET_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)

    def save_dataset(self):
        """Write ``self.dataset`` to ``DATASET_FILE`` as JSON.

        The data goes to a sibling temporary file that is then moved over the
        target, so an interrupted write cannot leave a truncated dataset file.
        """
        tmp_path = f"{DATASET_FILE}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(self.dataset, f)
        os.replace(tmp_path, DATASET_FILE)

    def add_image(self, url):
        """Scrape the page at *url* and add its image and tags to the dataset.

        Args:
            url: Address of the page to scrape. Surrounding whitespace is
                ignored.

        Returns:
            A status message: ``"Added image with tags: ..."`` on success,
            otherwise ``"Error: ..."`` describing what went wrong. Exceptions
            are reported through the message rather than raised.
        """
        url = (url or "").strip()
        if not url:
            return "Error: No URL provided"

        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            html_content = response.text

            image_url = extract_image_url(html_content)
            if not image_url:
                raise Exception("Failed to extract image URL")

            tags = extract_tags(html_content)
            image = download_image(image_url)

            # The file is always written with a .jpg name, so make sure the
            # pixel mode is one the JPEG writer accepts.
            if image.mode not in self.JPEG_MODES:
                image = image.convert("RGB")

            # Generate a unique filename
            filename = f"{uuid.uuid4()}.jpg"
            filepath = os.path.join(IMAGES_DIR, filename)

            # Save the image
            image.save(filepath)

            self.dataset.append({
                'image': filename,
                'tags': tags,
            })

            self.save_dataset()
            return f"Added image with tags: {tags}"
        except Exception as e:
            return f"Error: {str(e)}"

    def build_huggingface_dataset(self):
        """Build a ``datasets.Dataset`` from the collected entries.

        The dataset is built from the image file paths and the tag strings.
        The resulting object is not retained; only a status message is
        returned.

        Returns:
            A message stating success, an empty dataset, or the error raised
            during construction.
        """
        if not self.dataset:
            return "Dataset is empty. Add some images first."

        try:
            Dataset.from_dict({
                'image': [os.path.join(IMAGES_DIR, item['image']) for item in self.dataset],
                'tags': [item['tags'] for item in self.dataset],
            })
        except Exception as e:
            return f"Error creating HuggingFace Dataset: {str(e)}"
        return "HuggingFace Dataset created successfully!"

    def get_dataset_info(self):
        """Return a one-line summary with the number of stored entries."""
        return f"Current dataset size: {len(self.dataset)} images"

    def get_dataset_preview(self, num_images=5):
        """Return the most recently added entries for display.

        Args:
            num_images: Maximum number of entries to return, counted from the
                end of the dataset. Values of zero or less yield an empty list.

        Returns:
            A list of ``(image_path, tags)`` tuples, oldest first.
        """
        if num_images <= 0:
            # Guard needed because a slice of [-0:] is the whole list.
            return []
        return [
            (os.path.join(IMAGES_DIR, item['image']), item['tags'])
            for item in self.dataset[-num_images:]
        ]
|
117 |
+
|
118 |
+
# Single builder instance shared by all UI callbacks below.
dataset_builder = DatasetBuilder()


def add_image_to_dataset(url):
    """Add the image behind *url* and report the refreshed dataset state.

    Returns:
        A 3-tuple of (status message, dataset size summary, preview entries),
        matching the three outputs wired to the "Add Image" button.
    """
    status = dataset_builder.add_image(url)
    summary = dataset_builder.get_dataset_info()
    recent = dataset_builder.get_dataset_preview()
    return status, summary, recent
|
123 |
+
|
124 |
+
def create_huggingface_dataset():
    """Ask the shared builder to create the dataset; return its status message."""
    outcome = dataset_builder.build_huggingface_dataset()
    return outcome
|
126 |
+
|
127 |
+
def view_dataset():
    """Return up to the 20 most recent entries for the dataset gallery."""
    entries = dataset_builder.get_dataset_preview(num_images=20)
    return entries
|
129 |
+
|
130 |
+
# Build the Gradio UI. Components are declared top to bottom in display order;
# the click handlers are attached once every component exists.
# NOTE(review): `.style(...)` on components appears to be deprecated in later
# Gradio 3.x releases and removed in 4.x — confirm against the pinned version.
with gr.Blocks(theme="huggingface") as iface:
    # --- Header -----------------------------------------------------------
    gr.Markdown("# Image Dataset Builder")
    gr.Markdown("Enter a URL to add an image and its tags to the dataset. Progress is saved automatically.")

    # --- Add-image form ---------------------------------------------------
    with gr.Row():
        url_input = gr.Textbox(
            lines=2,
            placeholder="Enter image URL here...",
        )
        add_button = gr.Button("Add Image")

    result_output = gr.Textbox(label="Result")
    dataset_info = gr.Textbox(
        label="Dataset Info",
        value=dataset_builder.get_dataset_info(),
    )

    # --- Preview of the latest additions ----------------------------------
    gr.Markdown("## Dataset Preview")
    preview_gallery = gr.Gallery(
        label="Recent Additions",
        show_label=False,
        elem_id="preview_gallery",
    ).style(grid=5, height="auto")

    # --- HuggingFace dataset creation -------------------------------------
    create_hf_button = gr.Button("Create HuggingFace Dataset")
    hf_result = gr.Textbox(label="Dataset Creation Result")

    # --- Full dataset view ------------------------------------------------
    view_dataset_button = gr.Button("View Dataset")
    dataset_gallery = gr.Gallery(
        label="Dataset Contents",
        show_label=False,
        elem_id="dataset_gallery",
    ).style(grid=5, height="auto")

    # --- Event wiring (same relative order as the components above) --------
    add_button.click(
        add_image_to_dataset,
        inputs=url_input,
        outputs=[result_output, dataset_info, preview_gallery],
    )
    create_hf_button.click(
        create_huggingface_dataset,
        inputs=[],
        outputs=hf_result,
    )
    view_dataset_button.click(
        view_dataset,
        inputs=[],
        outputs=dataset_gallery,
    )

# Launch the interface
iface.launch()
|