throaway2854 committed on
Commit
4137e07
·
verified ·
1 Parent(s): 4120679

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -0
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import os
4
+ import json
5
+ import gradio as gr
6
+ from datasets import Dataset
7
+ from PIL import Image
8
+ import io
9
+ import uuid
10
+ import shutil
11
+
12
# Base directory under which the app keeps everything it writes.
DATA_DIR = "/data"
# Folder holding the downloaded image files.
IMAGES_DIR = os.path.join(DATA_DIR, "images")
# JSON file holding the list of dataset records.
DATASET_FILE = os.path.join(DATA_DIR, "dataset.json")
15
+
16
def extract_image_url(html_content):
    """Return the URL of the main image found in *html_content*, or None.

    Two strategies are tried in order:

    1. A ``<script type="text/javascript">`` whose text contains
       ``image =`` followed by a JSON object with the keys ``domain``,
       ``base_dir``, ``dir`` and ``img``; the URL is assembled from them.
    2. The ``src`` attribute of the first ``<img>`` tag that has an
       ``alt`` attribute.

    Args:
        html_content: HTML markup of the page, as a string.

    Returns:
        The image URL as a string, or None when neither strategy yields one.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)

    if script:
        # Take what follows the `image =` marker itself rather than the first
        # '=' in the script, so an earlier assignment cannot shift the payload.
        payload = script.string.partition('image =')[2].strip().rstrip(';')
        try:
            image_data = json.loads(payload)
            return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
        except (ValueError, KeyError, TypeError):
            # Payload is not valid JSON or lacks an expected key: fall through
            # to the <img> lookup below instead of aborting the extraction.
            pass

    img_tag = soup.find('img', alt=True)
    if img_tag and 'src' in img_tag.attrs:
        return img_tag['src']

    return None
29
+
30
def extract_tags(html_content):
    """Return the general tags found in *html_content* as one comma-joined string.

    Every ``<li class="tag-type-general">`` element is inspected and the text
    of its second ``<a>`` child is collected.

    Args:
        html_content: HTML markup of the page, as a string.

    Returns:
        The tag texts joined with ``,`` (an empty string when none are found).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    tags = []
    for tag_element in soup.find_all('li', class_='tag-type-general'):
        links = tag_element.find_all('a')
        # The tag name is read from the second anchor (presumably the first
        # one is a help/wiki link -- confirm against the target site). Entries
        # with fewer than two anchors are skipped instead of raising IndexError.
        if len(links) < 2:
            continue
        tags.append(links[1].text)

    return ','.join(tags)
41
+
42
def download_image(url, timeout=30):
    """Download *url* and open the response body as a PIL image.

    Args:
        url: Address of the image file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled server.

    Returns:
        The ``PIL.Image.Image`` opened from the downloaded bytes.

    Raises:
        Exception: If the HTTP request fails or returns an error status; the
            underlying ``requests`` error is chained as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        raise Exception(f"Failed to download image: {str(e)}") from e
    return Image.open(io.BytesIO(response.content))
49
+
50
class DatasetBuilder:
    """Collects scraped images and their tags into an on-disk dataset.

    Image files are written under ``IMAGES_DIR``; the list of
    ``{'image': filename, 'tags': tags}`` records is stored as JSON in
    ``DATASET_FILE`` and mirrored in ``self.dataset``.
    """

    # Seconds to wait for the page request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Load any previously saved records and make sure the image folder exists."""
        self.dataset = self.load_dataset()
        # Most recent result of build_huggingface_dataset(), if any.
        self.hf_dataset = None
        os.makedirs(IMAGES_DIR, exist_ok=True)

    def load_dataset(self):
        """Return the records stored in ``DATASET_FILE``, or ``[]`` if it does not exist."""
        if os.path.exists(DATASET_FILE):
            with open(DATASET_FILE, 'r') as f:
                return json.load(f)
        return []

    def save_dataset(self):
        """Write the current records to ``DATASET_FILE`` as JSON."""
        with open(DATASET_FILE, 'w') as f:
            json.dump(self.dataset, f)

    def add_image(self, url):
        """Fetch the page at *url*, store its image and tags, and persist the dataset.

        Args:
            url: Address of the HTML page that contains the image and tag list.

        Returns:
            A human-readable status string: a success message listing the
            tags, or ``"Error: ..."`` describing what went wrong. Failures are
            reported through the return value rather than raised.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            html_content = response.text

            image_url = extract_image_url(html_content)
            if not image_url:
                raise Exception("Failed to extract image URL")

            tags = extract_tags(html_content)
            image = download_image(image_url)

            # Generate a unique filename
            filename = f"{uuid.uuid4()}.jpg"
            filepath = os.path.join(IMAGES_DIR, filename)

            # The file is always written as JPEG, which cannot store alpha or
            # palette modes (e.g. RGBA, P); convert those so save() succeeds.
            if image.mode not in ("RGB", "L", "CMYK"):
                image = image.convert("RGB")

            # Save the image
            image.save(filepath)

            self.dataset.append({
                'image': filename,
                'tags': tags
            })

            self.save_dataset()
            return f"Added image with tags: {tags}"
        except Exception as e:
            # Boundary with the UI: turn any failure into a status message.
            return f"Error: {str(e)}"

    def build_huggingface_dataset(self):
        """Build a ``datasets.Dataset`` from the current records.

        The dataset has an ``image`` column (file paths under ``IMAGES_DIR``)
        and a ``tags`` column. It is kept on ``self.hf_dataset``.

        Returns:
            A status string describing success, an empty dataset, or the error.
        """
        if not self.dataset:
            return "Dataset is empty. Add some images first."

        try:
            self.hf_dataset = Dataset.from_dict({
                'image': [os.path.join(IMAGES_DIR, item['image']) for item in self.dataset],
                'tags': [item['tags'] for item in self.dataset]
            })
            return "HuggingFace Dataset created successfully!"
        except Exception as e:
            return f"Error creating HuggingFace Dataset: {str(e)}"

    def get_dataset_info(self):
        """Return a one-line summary with the number of stored images."""
        return f"Current dataset size: {len(self.dataset)} images"

    def get_dataset_preview(self, num_images=5):
        """Return ``(image_path, tags)`` pairs for the last *num_images* records.

        Args:
            num_images: Maximum number of most recent records to include.
                Zero or a negative value yields an empty preview.

        Returns:
            A list of ``(path, tags)`` tuples, oldest first.
        """
        # A slice of [-0:] would return the whole list, so handle it explicitly.
        if num_images <= 0:
            return []
        return [
            (os.path.join(IMAGES_DIR, item['image']), item['tags'])
            for item in self.dataset[-num_images:]
        ]
117
+
118
# Single module-level builder instance used by the callback functions below.
dataset_builder = DatasetBuilder()
119
+
120
def add_image_to_dataset(url):
    """Add the image behind *url* and return the refreshed UI values.

    Returns:
        A 3-tuple of (status message, dataset size summary, preview list),
        obtained from the module-level ``dataset_builder``.
    """
    status = dataset_builder.add_image(url)
    summary = dataset_builder.get_dataset_info()
    recent = dataset_builder.get_dataset_preview()
    return status, summary, recent
123
+
124
def create_huggingface_dataset():
    """Build the HuggingFace dataset and return the resulting status message."""
    outcome = dataset_builder.build_huggingface_dataset()
    return outcome
126
+
127
def view_dataset():
    """Return a preview of up to the 20 most recent dataset entries."""
    entries = dataset_builder.get_dataset_preview(num_images=20)
    return entries
129
+
130
# Create Gradio interface
with gr.Blocks(theme="huggingface") as iface:
    gr.Markdown("# Image Dataset Builder")
    gr.Markdown("Enter a URL to add an image and its tags to the dataset. Progress is saved automatically.")

    with gr.Row():
        url_input = gr.Textbox(lines=2, placeholder="Enter image URL here...")
        add_button = gr.Button("Add Image")

    result_output = gr.Textbox(label="Result")
    dataset_info = gr.Textbox(label="Dataset Info", value=dataset_builder.get_dataset_info())

    gr.Markdown("## Dataset Preview")
    # Layout options are passed to the constructor: the chained
    # `.style(grid=..., height=...)` call is deprecated and no longer
    # available in Gradio 4.
    preview_gallery = gr.Gallery(
        label="Recent Additions",
        show_label=False,
        elem_id="preview_gallery",
        columns=5,
        height="auto",
    )

    # Adding an image refreshes the status text, the size summary and the preview.
    add_button.click(add_image_to_dataset, inputs=url_input, outputs=[result_output, dataset_info, preview_gallery])

    create_hf_button = gr.Button("Create HuggingFace Dataset")
    hf_result = gr.Textbox(label="Dataset Creation Result")
    create_hf_button.click(create_huggingface_dataset, inputs=[], outputs=hf_result)

    view_dataset_button = gr.Button("View Dataset")
    dataset_gallery = gr.Gallery(
        label="Dataset Contents",
        show_label=False,
        elem_id="dataset_gallery",
        columns=5,
        height="auto",
    )
    view_dataset_button.click(view_dataset, inputs=[], outputs=dataset_gallery)

# Launch the interface
iface.launch()