throaway2854 committed on
Commit
1f07e5c
·
verified ·
1 Parent(s): c6b4f6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -8
app.py CHANGED
@@ -7,11 +7,38 @@ from datasets import Dataset
7
  from PIL import Image
8
  import io
9
  import uuid
 
 
10
 
11
  DATA_DIR = "/data"
12
  IMAGES_DIR = os.path.join(DATA_DIR, "images")
13
  DATASET_FILE = os.path.join(DATA_DIR, "dataset.json")
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def extract_image_url(html_content):
16
  soup = BeautifulSoup(html_content, 'html.parser')
17
  script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
@@ -38,9 +65,9 @@ def extract_tags(html_content):
38
 
39
  return ','.join(tags)
40
 
41
- def download_image(url):
42
  try:
43
- response = requests.get(url)
44
  response.raise_for_status()
45
  return Image.open(io.BytesIO(response.content))
46
  except requests.RequestException as e:
@@ -61,9 +88,9 @@ class DatasetBuilder:
61
  with open(DATASET_FILE, 'w') as f:
62
  json.dump(self.dataset, f)
63
 
64
- def add_image(self, url):
65
  try:
66
- response = requests.get(url)
67
  response.raise_for_status()
68
  html_content = response.text
69
 
@@ -72,7 +99,7 @@ class DatasetBuilder:
72
  raise Exception("Failed to extract image URL")
73
 
74
  tags = extract_tags(html_content)
75
- image = download_image(image_url)
76
 
77
  # Generate a unique filename
78
  filename = f"{uuid.uuid4()}.jpg"
@@ -116,8 +143,8 @@ class DatasetBuilder:
116
 
117
  dataset_builder = DatasetBuilder()
118
 
119
- def add_image_to_dataset(url):
120
- result = dataset_builder.add_image(url)
121
  return result, dataset_builder.get_dataset_info(), dataset_builder.get_dataset_preview()
122
 
123
  def create_huggingface_dataset():
@@ -133,6 +160,7 @@ with gr.Blocks(theme="huggingface") as iface:
133
 
134
  with gr.Row():
135
  url_input = gr.Textbox(lines=2, placeholder="Enter image URL here...")
 
136
  add_button = gr.Button("Add Image")
137
 
138
  result_output = gr.Textbox(label="Result")
@@ -141,7 +169,7 @@ with gr.Blocks(theme="huggingface") as iface:
141
  gr.Markdown("## Dataset Preview")
142
  preview_gallery = gr.Gallery(label="Recent Additions", show_label=False, elem_id="preview_gallery", columns=5, rows=1, height="auto")
143
 
144
- add_button.click(add_image_to_dataset, inputs=url_input, outputs=[result_output, dataset_info, preview_gallery])
145
 
146
  create_hf_button = gr.Button("Create HuggingFace Dataset")
147
  hf_result = gr.Textbox(label="Dataset Creation Result")
 
7
  from PIL import Image
8
  import io
9
  import uuid
10
+ import time
11
+ import random
12
 
13
  DATA_DIR = "/data"
14
  IMAGES_DIR = os.path.join(DATA_DIR, "images")
15
  DATASET_FILE = os.path.join(DATA_DIR, "dataset.json")
16
 
17
# Pool of desktop browser User-Agent strings; get_headers() picks one at
# random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
]


def get_headers(cookies=None):
    """Build browser-like HTTP request headers.

    Args:
        cookies: Optional raw ``Cookie`` header value (e.g. ``"a=1; b=2"``).
            String values are whitespace-normalized: leading/trailing
            whitespace is dropped and any internal run of whitespace,
            including line breaks, becomes a single space. If nothing is
            left, no ``Cookie`` header is sent.

    Returns:
        A new dict of header names to values with a randomly chosen
        ``User-Agent`` from ``USER_AGENTS``.
    """
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    # The value is typed or pasted into a multi-line text box, so it can carry
    # stray newlines or be whitespace-only. Header values containing line
    # breaks or leading whitespace are rejected by the HTTP client, so collapse
    # all whitespace before using it.
    if isinstance(cookies, str):
        cookies = " ".join(cookies.split())
    if cookies:
        headers["Cookie"] = cookies
    return headers
37
+
38
def make_request(url, cookies=None):
    """Issue a throttled GET request with browser-like headers.

    Sleeps for a random 1-3 seconds first, then fetches ``url`` using the
    headers produced by ``get_headers(cookies)`` and a 10-second timeout.
    The response is returned as-is; status checking is left to the caller.
    """
    # Randomized pause so consecutive requests are not evenly spaced.
    pause_seconds = random.uniform(1, 3)
    time.sleep(pause_seconds)

    request_headers = get_headers(cookies)
    return requests.get(url, headers=request_headers, timeout=10)
41
+
42
  def extract_image_url(html_content):
43
  soup = BeautifulSoup(html_content, 'html.parser')
44
  script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
 
65
 
66
  return ','.join(tags)
67
 
68
+ def download_image(url, cookies=None):
69
  try:
70
+ response = make_request(url, cookies)
71
  response.raise_for_status()
72
  return Image.open(io.BytesIO(response.content))
73
  except requests.RequestException as e:
 
88
  with open(DATASET_FILE, 'w') as f:
89
  json.dump(self.dataset, f)
90
 
91
+ def add_image(self, url, cookies=None):
92
  try:
93
+ response = make_request(url, cookies)
94
  response.raise_for_status()
95
  html_content = response.text
96
 
 
99
  raise Exception("Failed to extract image URL")
100
 
101
  tags = extract_tags(html_content)
102
+ image = download_image(image_url, cookies)
103
 
104
  # Generate a unique filename
105
  filename = f"{uuid.uuid4()}.jpg"
 
143
 
144
  dataset_builder = DatasetBuilder()
145
 
146
def add_image_to_dataset(url, cookies=None):
    """Add the image behind ``url`` to the shared dataset and report the result.

    Args:
        url: Page URL passed through to ``dataset_builder.add_image``.
        cookies: Optional cookie string forwarded with the requests. Defaults
            to ``None`` so the function can still be called with only a URL,
            matching the ``cookies=None`` default of the functions it calls.

    Returns:
        A 3-tuple ``(result, dataset_info, dataset_preview)``: the value
        returned by ``add_image`` followed by the builder's current info and
        preview.
    """
    result = dataset_builder.add_image(url, cookies)
    dataset_info = dataset_builder.get_dataset_info()
    dataset_preview = dataset_builder.get_dataset_preview()
    return result, dataset_info, dataset_preview
149
 
150
  def create_huggingface_dataset():
 
160
 
161
  with gr.Row():
162
  url_input = gr.Textbox(lines=2, placeholder="Enter image URL here...")
163
+ cookies_input = gr.Textbox(lines=2, placeholder="Enter cookies (optional)")
164
  add_button = gr.Button("Add Image")
165
 
166
  result_output = gr.Textbox(label="Result")
 
169
  gr.Markdown("## Dataset Preview")
170
  preview_gallery = gr.Gallery(label="Recent Additions", show_label=False, elem_id="preview_gallery", columns=5, rows=1, height="auto")
171
 
172
+ add_button.click(add_image_to_dataset, inputs=[url_input, cookies_input], outputs=[result_output, dataset_info, preview_gallery])
173
 
174
  create_hf_button = gr.Button("Create HuggingFace Dataset")
175
  hf_result = gr.Textbox(label="Dataset Creation Result")