throaway2854 committed: Update app.py
app.py CHANGED
@@ -12,9 +12,6 @@ import random
 
 DATA_DIR = "/data"
 IMAGES_DIR = os.path.join(DATA_DIR, "images")
-DATASET_FILE = os.path.join(DATA_DIR, "dataset.json")
-
-# Add a user agent rotation list
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
@@ -42,30 +39,16 @@ def make_request(url, cookies=None):
 def extract_image_url(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
 
-    # First, try to extract the image URL from the <script> tag
     script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
-
     if script:
         try:
-            # Extract and clean the JavaScript object string
             js_object_str = script.string.split('=', 1)[1].strip().rstrip(';')
-
-            # Log the string for debugging
-            print("Extracted JavaScript object string:", js_object_str)
-
-            # Replace single quotes with double quotes to make it a valid JSON string
             js_object_str = js_object_str.replace("'", '"')
-
-            # Parse the JSON object
             image_data = json.loads(js_object_str)
-
-            # Construct the full image URL
             return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
-
         except json.JSONDecodeError as e:
             raise Exception(f"Failed to decode JSON: {str(e)}")
 
-    # If the script tag method fails, try to get the image URL from an <img alt> tag
     img_tag = soup.find('img', alt=True)
     if img_tag and 'src' in img_tag.attrs:
         return img_tag['src']
@@ -74,19 +57,8 @@ def extract_image_url(html_content):
 
 def extract_tags(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
-
-    # Find all list items with the relevant class
     tag_elements = soup.find_all('li', class_='tag-type-general')
-
-    tags = []
-    for tag_element in tag_elements:
-        # The second <a> tag contains the relevant tag name
-        tag_links = tag_element.find_all('a')
-        if len(tag_links) > 1:
-            tag_name = tag_links[1].text
-            tags.append(tag_name)
-
-    # Join all tags into a single string separated by commas
+    tags = [tag_element.find_all('a')[1].text for tag_element in tag_elements if len(tag_element.find_all('a')) > 1]
     return ','.join(tags)
 
 def download_image(url, cookies=None):
@@ -98,18 +70,24 @@ def download_image(url, cookies=None):
         raise Exception(f"Failed to download image: {str(e)}")
 
 class DatasetBuilder:
-    def __init__(self):
+    def __init__(self, dataset_name):
+        self.dataset_name = dataset_name
         self.dataset = self.load_dataset()
         os.makedirs(IMAGES_DIR, exist_ok=True)
 
+    def get_dataset_file(self):
+        return os.path.join(DATA_DIR, f"{self.dataset_name}.json")
+
     def load_dataset(self):
-        if os.path.exists(DATASET_FILE):
-            with open(DATASET_FILE, 'r') as f:
+        dataset_file = self.get_dataset_file()
+        if os.path.exists(dataset_file):
+            with open(dataset_file, 'r') as f:
                 return json.load(f)
         return []
 
     def save_dataset(self):
-        with open(DATASET_FILE, 'w') as f:
+        dataset_file = self.get_dataset_file()
+        with open(dataset_file, 'w') as f:
             json.dump(self.dataset, f)
 
     def add_image(self, url, cookies=None):
@@ -125,11 +103,9 @@ class DatasetBuilder:
         tags = extract_tags(html_content)
         image = download_image(image_url, cookies)
 
-        # Generate a unique filename
         filename = f"{uuid.uuid4()}.jpg"
         filepath = os.path.join(IMAGES_DIR, filename)
 
-        # Save the image
         image.save(filepath)
 
         self.dataset.append({
@@ -156,7 +132,7 @@ class DatasetBuilder:
         return f"Error creating HuggingFace Dataset: {str(e)}"
 
     def get_dataset_info(self):
-        return f"Current dataset size: {len(self.dataset)} images"
+        return f"Current dataset size ({self.dataset_name}): {len(self.dataset)} images"
 
     def get_dataset_preview(self, num_images=5):
         preview = []
@@ -165,17 +141,18 @@ class DatasetBuilder:
             preview.append((image_path, item['tags']))
         return preview
 
-dataset_builder = DatasetBuilder()
-
-def add_image_to_dataset(url, cookies):
-    result = dataset_builder.add_image(url, cookies)
-    return result, dataset_builder.get_dataset_info(), dataset_builder.get_dataset_preview()
+def add_image_to_dataset(url, cookies, dataset_name):
+    builder = DatasetBuilder(dataset_name)
+    result = builder.add_image(url, cookies)
+    return result, builder.get_dataset_info(), builder.get_dataset_preview()
 
-def create_huggingface_dataset():
-    return dataset_builder.build_huggingface_dataset()
+def create_huggingface_dataset(dataset_name):
+    builder = DatasetBuilder(dataset_name)
+    return builder.build_huggingface_dataset()
 
-def view_dataset():
-    return dataset_builder.get_dataset_preview(num_images=20)
+def view_dataset(dataset_name):
+    builder = DatasetBuilder(dataset_name)
+    return builder.get_dataset_preview(num_images=20)
 
 # Create Gradio interface
 with gr.Blocks(theme="huggingface") as iface:
@@ -183,25 +160,26 @@ with gr.Blocks(theme="huggingface") as iface:
     gr.Markdown("Enter a URL to add an image and its tags to the dataset. Progress is saved automatically.")
 
     with gr.Row():
+        dataset_name_input = gr.Textbox(lines=1, placeholder="Enter dataset name...", value="default_dataset")
         url_input = gr.Textbox(lines=2, placeholder="Enter image URL here...")
         cookies_input = gr.Textbox(lines=2, placeholder="Enter cookies (optional)")
     add_button = gr.Button("Add Image")
 
     result_output = gr.Textbox(label="Result")
-    dataset_info = gr.Textbox(label="Dataset Info"
+    dataset_info = gr.Textbox(label="Dataset Info")
 
     gr.Markdown("## Dataset Preview")
     preview_gallery = gr.Gallery(label="Recent Additions", show_label=False, elem_id="preview_gallery", columns=5, rows=1, height="auto")
 
-    add_button.click(add_image_to_dataset, inputs=[url_input, cookies_input], outputs=[result_output, dataset_info, preview_gallery])
+    add_button.click(add_image_to_dataset, inputs=[url_input, cookies_input, dataset_name_input], outputs=[result_output, dataset_info, preview_gallery])
 
     create_hf_button = gr.Button("Create HuggingFace Dataset")
     hf_result = gr.Textbox(label="Dataset Creation Result")
-    create_hf_button.click(create_huggingface_dataset, inputs=[], outputs=hf_result)
+    create_hf_button.click(create_huggingface_dataset, inputs=[dataset_name_input], outputs=hf_result)
 
     view_dataset_button = gr.Button("View Dataset")
     dataset_gallery = gr.Gallery(label="Dataset Contents", show_label=False, elem_id="dataset_gallery", columns=5, rows=4, height="auto")
-    view_dataset_button.click(view_dataset, inputs=[], outputs=dataset_gallery)
+    view_dataset_button.click(view_dataset, inputs=[dataset_name_input], outputs=dataset_gallery)
 
 # Launch the interface
-iface.launch()
+iface.launch()
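For reference, a minimal usage sketch of the per-dataset flow this commit introduces is below. It is illustrative only: it assumes DatasetBuilder can be imported from this Space's app.py without launching the Gradio UI, and the dataset name and URL are placeholder values, not anything from the commit.

# Illustrative sketch -- assumes `from app import DatasetBuilder` works without
# side effects; "example_dataset" and the URL below are placeholders.
from app import DatasetBuilder

builder = DatasetBuilder("example_dataset")   # persists to /data/example_dataset.json
print(builder.get_dataset_info())             # e.g. "Current dataset size (example_dataset): 0 images"
msg = builder.add_image("https://example.com/post/12345", cookies=None)
print(msg)
print(builder.get_dataset_info())

Each dataset name now maps to its own JSON file under /data, so several datasets can be built side by side instead of sharing the single dataset.json that the removed DATASET_FILE constant pointed to.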