throaway2854 committed on
Commit
d3d564b
·
verified ·
1 Parent(s): ade988f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -50
app.py CHANGED
@@ -12,9 +12,6 @@ import random
12
 
13
  DATA_DIR = "/data"
14
  IMAGES_DIR = os.path.join(DATA_DIR, "images")
15
- DATASET_FILE = os.path.join(DATA_DIR, "dataset.json")
16
-
17
- # Add a user agent rotation list
18
  USER_AGENTS = [
19
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
20
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
@@ -42,30 +39,16 @@ def make_request(url, cookies=None):
42
  def extract_image_url(html_content):
43
  soup = BeautifulSoup(html_content, 'html.parser')
44
 
45
- # First, try to extract the image URL from the <script> tag
46
  script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
47
-
48
  if script:
49
  try:
50
- # Extract and clean the JavaScript object string
51
  js_object_str = script.string.split('=', 1)[1].strip().rstrip(';')
52
-
53
- # Log the string for debugging
54
- print("Extracted JavaScript object string:", js_object_str)
55
-
56
- # Replace single quotes with double quotes to make it a valid JSON string
57
  js_object_str = js_object_str.replace("'", '"')
58
-
59
- # Parse the JSON object
60
  image_data = json.loads(js_object_str)
61
-
62
- # Construct the full image URL
63
  return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
64
-
65
  except json.JSONDecodeError as e:
66
  raise Exception(f"Failed to decode JSON: {str(e)}")
67
 
68
- # If the script tag method fails, try to get the image URL from an <img alt> tag
69
  img_tag = soup.find('img', alt=True)
70
  if img_tag and 'src' in img_tag.attrs:
71
  return img_tag['src']
@@ -74,19 +57,8 @@ def extract_image_url(html_content):
74
 
75
  def extract_tags(html_content):
76
  soup = BeautifulSoup(html_content, 'html.parser')
77
-
78
- # Find all list items with the relevant class
79
  tag_elements = soup.find_all('li', class_='tag-type-general')
80
-
81
- tags = []
82
- for tag_element in tag_elements:
83
- # The second <a> tag contains the relevant tag name
84
- tag_links = tag_element.find_all('a')
85
- if len(tag_links) > 1:
86
- tag_name = tag_links[1].text
87
- tags.append(tag_name)
88
-
89
- # Join all tags into a single string separated by commas
90
  return ','.join(tags)
91
 
92
  def download_image(url, cookies=None):
@@ -98,18 +70,24 @@ def download_image(url, cookies=None):
98
  raise Exception(f"Failed to download image: {str(e)}")
99
 
100
  class DatasetBuilder:
101
- def __init__(self):
 
102
  self.dataset = self.load_dataset()
103
  os.makedirs(IMAGES_DIR, exist_ok=True)
104
 
 
 
 
105
  def load_dataset(self):
106
- if os.path.exists(DATASET_FILE):
107
- with open(DATASET_FILE, 'r') as f:
 
108
  return json.load(f)
109
  return []
110
 
111
  def save_dataset(self):
112
- with open(DATASET_FILE, 'w') as f:
 
113
  json.dump(self.dataset, f)
114
 
115
  def add_image(self, url, cookies=None):
@@ -125,11 +103,9 @@ class DatasetBuilder:
125
  tags = extract_tags(html_content)
126
  image = download_image(image_url, cookies)
127
 
128
- # Generate a unique filename
129
  filename = f"{uuid.uuid4()}.jpg"
130
  filepath = os.path.join(IMAGES_DIR, filename)
131
 
132
- # Save the image
133
  image.save(filepath)
134
 
135
  self.dataset.append({
@@ -156,7 +132,7 @@ class DatasetBuilder:
156
  return f"Error creating HuggingFace Dataset: {str(e)}"
157
 
158
  def get_dataset_info(self):
159
- return f"Current dataset size: {len(self.dataset)} images"
160
 
161
  def get_dataset_preview(self, num_images=5):
162
  preview = []
@@ -165,17 +141,18 @@ class DatasetBuilder:
165
  preview.append((image_path, item['tags']))
166
  return preview
167
 
168
- dataset_builder = DatasetBuilder()
169
-
170
- def add_image_to_dataset(url, cookies):
171
- result = dataset_builder.add_image(url, cookies)
172
- return result, dataset_builder.get_dataset_info(), dataset_builder.get_dataset_preview()
173
 
174
- def create_huggingface_dataset():
175
- return dataset_builder.build_huggingface_dataset()
 
176
 
177
- def view_dataset():
178
- return dataset_builder.get_dataset_preview(num_images=20)
 
179
 
180
  # Create Gradio interface
181
  with gr.Blocks(theme="huggingface") as iface:
@@ -183,25 +160,26 @@ with gr.Blocks(theme="huggingface") as iface:
183
  gr.Markdown("Enter a URL to add an image and its tags to the dataset. Progress is saved automatically.")
184
 
185
  with gr.Row():
 
186
  url_input = gr.Textbox(lines=2, placeholder="Enter image URL here...")
187
  cookies_input = gr.Textbox(lines=2, placeholder="Enter cookies (optional)")
188
  add_button = gr.Button("Add Image")
189
 
190
  result_output = gr.Textbox(label="Result")
191
- dataset_info = gr.Textbox(label="Dataset Info", value=dataset_builder.get_dataset_info())
192
 
193
  gr.Markdown("## Dataset Preview")
194
  preview_gallery = gr.Gallery(label="Recent Additions", show_label=False, elem_id="preview_gallery", columns=5, rows=1, height="auto")
195
 
196
- add_button.click(add_image_to_dataset, inputs=[url_input, cookies_input], outputs=[result_output, dataset_info, preview_gallery])
197
 
198
  create_hf_button = gr.Button("Create HuggingFace Dataset")
199
  hf_result = gr.Textbox(label="Dataset Creation Result")
200
- create_hf_button.click(create_huggingface_dataset, inputs=[], outputs=hf_result)
201
 
202
  view_dataset_button = gr.Button("View Dataset")
203
  dataset_gallery = gr.Gallery(label="Dataset Contents", show_label=False, elem_id="dataset_gallery", columns=5, rows=4, height="auto")
204
- view_dataset_button.click(view_dataset, inputs=[], outputs=dataset_gallery)
205
 
206
  # Launch the interface
207
- iface.launch()
 
12
 
13
  DATA_DIR = "/data"
14
  IMAGES_DIR = os.path.join(DATA_DIR, "images")
 
 
 
15
  USER_AGENTS = [
16
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
17
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
 
39
  def extract_image_url(html_content):
40
  soup = BeautifulSoup(html_content, 'html.parser')
41
 
 
42
  script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
 
43
  if script:
44
  try:
 
45
  js_object_str = script.string.split('=', 1)[1].strip().rstrip(';')
 
 
 
 
 
46
  js_object_str = js_object_str.replace("'", '"')
 
 
47
  image_data = json.loads(js_object_str)
 
 
48
  return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
 
49
  except json.JSONDecodeError as e:
50
  raise Exception(f"Failed to decode JSON: {str(e)}")
51
 
 
52
  img_tag = soup.find('img', alt=True)
53
  if img_tag and 'src' in img_tag.attrs:
54
  return img_tag['src']
 
57
 
58
  def extract_tags(html_content):
59
  soup = BeautifulSoup(html_content, 'html.parser')
 
 
60
  tag_elements = soup.find_all('li', class_='tag-type-general')
61
+ tags = [tag_element.find_all('a')[1].text for tag_element in tag_elements if len(tag_element.find_all('a')) > 1]
 
 
 
 
 
 
 
 
 
62
  return ','.join(tags)
63
 
64
  def download_image(url, cookies=None):
 
70
  raise Exception(f"Failed to download image: {str(e)}")
71
 
72
  class DatasetBuilder:
73
def __init__(self, dataset_name):
    """Open the dataset called *dataset_name*.

    Stores the name, loads whatever records are already saved for it, and
    makes sure the image directory exists.

    Args:
        dataset_name: Name used to derive the dataset's JSON file.
    """
    self.dataset_name = dataset_name
    # The name must be set first: load_dataset() resolves its file from it.
    stored_records = self.load_dataset()
    self.dataset = stored_records
    os.makedirs(IMAGES_DIR, exist_ok=True)
77
 
78
def get_dataset_file(self):
    """Return the path of the JSON file that stores this dataset.

    The file lives directly in ``DATA_DIR`` and is named after
    ``self.dataset_name``. Only the final path component of the name is
    used, so a name containing directory separators or an absolute path
    cannot point outside ``DATA_DIR``. A name that is empty after this
    reduction falls back to ``"default_dataset"``.

    Returns:
        A path of the form ``<DATA_DIR>/<name>.json``.
    """
    # The name is typed into a free-text UI field, so treat it as untrusted.
    safe_name = os.path.basename(str(self.dataset_name).strip())
    if not safe_name:
        safe_name = "default_dataset"
    return os.path.join(DATA_DIR, f"{safe_name}.json")
80
+
81
  def load_dataset(self):
82
+ dataset_file = self.get_dataset_file()
83
+ if os.path.exists(dataset_file):
84
+ with open(dataset_file, 'r') as f:
85
  return json.load(f)
86
  return []
87
 
88
  def save_dataset(self):
89
+ dataset_file = self.get_dataset_file()
90
+ with open(dataset_file, 'w') as f:
91
  json.dump(self.dataset, f)
92
 
93
  def add_image(self, url, cookies=None):
 
103
  tags = extract_tags(html_content)
104
  image = download_image(image_url, cookies)
105
 
 
106
  filename = f"{uuid.uuid4()}.jpg"
107
  filepath = os.path.join(IMAGES_DIR, filename)
108
 
 
109
  image.save(filepath)
110
 
111
  self.dataset.append({
 
132
  return f"Error creating HuggingFace Dataset: {str(e)}"
133
 
134
  def get_dataset_info(self):
135
+ return f"Current dataset size ({self.dataset_name}): {len(self.dataset)} images"
136
 
137
  def get_dataset_preview(self, num_images=5):
138
  preview = []
 
141
  preview.append((image_path, item['tags']))
142
  return preview
143
 
144
def add_image_to_dataset(url, cookies, dataset_name):
    """Add one image to the named dataset and report the updated state.

    Args:
        url: URL passed to ``DatasetBuilder.add_image``.
        cookies: Cookies passed to ``DatasetBuilder.add_image``.
        dataset_name: Name of the dataset to open.

    Returns:
        A ``(result, info, preview)`` tuple: the value returned by
        ``add_image``, the summary from ``get_dataset_info`` and the items
        from ``get_dataset_preview``.
    """
    dataset_builder = DatasetBuilder(dataset_name)
    outcome = dataset_builder.add_image(url, cookies)
    # Info and preview are read after the add so they reflect its effect.
    info = dataset_builder.get_dataset_info()
    preview = dataset_builder.get_dataset_preview()
    return outcome, info, preview
 
148
 
149
def create_huggingface_dataset(dataset_name):
    """Build a HuggingFace dataset from the named dataset.

    Args:
        dataset_name: Name of the dataset to open.

    Returns:
        Whatever ``DatasetBuilder.build_huggingface_dataset`` returns.
    """
    return DatasetBuilder(dataset_name).build_huggingface_dataset()
152
 
153
def view_dataset(dataset_name):
    """Return a preview of up to 20 images from the named dataset.

    Args:
        dataset_name: Name of the dataset to open.

    Returns:
        The result of ``DatasetBuilder.get_dataset_preview(num_images=20)``.
    """
    return DatasetBuilder(dataset_name).get_dataset_preview(num_images=20)
156
 
157
  # Create Gradio interface
158
  with gr.Blocks(theme="huggingface") as iface:
 
160
  gr.Markdown("Enter a URL to add an image and its tags to the dataset. Progress is saved automatically.")
161
 
162
  with gr.Row():
163
+ dataset_name_input = gr.Textbox(lines=1, placeholder="Enter dataset name...", value="default_dataset")
164
  url_input = gr.Textbox(lines=2, placeholder="Enter image URL here...")
165
  cookies_input = gr.Textbox(lines=2, placeholder="Enter cookies (optional)")
166
  add_button = gr.Button("Add Image")
167
 
168
  result_output = gr.Textbox(label="Result")
169
+ dataset_info = gr.Textbox(label="Dataset Info")
170
 
171
  gr.Markdown("## Dataset Preview")
172
  preview_gallery = gr.Gallery(label="Recent Additions", show_label=False, elem_id="preview_gallery", columns=5, rows=1, height="auto")
173
 
174
+ add_button.click(add_image_to_dataset, inputs=[url_input, cookies_input, dataset_name_input], outputs=[result_output, dataset_info, preview_gallery])
175
 
176
  create_hf_button = gr.Button("Create HuggingFace Dataset")
177
  hf_result = gr.Textbox(label="Dataset Creation Result")
178
+ create_hf_button.click(create_huggingface_dataset, inputs=[dataset_name_input], outputs=hf_result)
179
 
180
  view_dataset_button = gr.Button("View Dataset")
181
  dataset_gallery = gr.Gallery(label="Dataset Contents", show_label=False, elem_id="dataset_gallery", columns=5, rows=4, height="auto")
182
+ view_dataset_button.click(view_dataset, inputs=[dataset_name_input], outputs=dataset_gallery)
183
 
184
  # Launch the interface
185
+ iface.launch()