throaway2854 committed: Update app.py
app.py CHANGED
@@ -12,9 +12,6 @@ import random
 
 DATA_DIR = "/data"
 IMAGES_DIR = os.path.join(DATA_DIR, "images")
-DATASET_FILE = os.path.join(DATA_DIR, "dataset.json")
-
-# Add a user agent rotation list
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
@@ -42,30 +39,16 @@ def make_request(url, cookies=None):
 def extract_image_url(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
 
-    # First, try to extract the image URL from the <script> tag
     script = soup.find('script', type='text/javascript', string=lambda text: 'image =' in text if text else False)
-
     if script:
         try:
-            # Extract and clean the JavaScript object string
             js_object_str = script.string.split('=', 1)[1].strip().rstrip(';')
-
-            # Log the string for debugging
-            print("Extracted JavaScript object string:", js_object_str)
-
-            # Replace single quotes with double quotes to make it a valid JSON string
             js_object_str = js_object_str.replace("'", '"')
-
-            # Parse the JSON object
             image_data = json.loads(js_object_str)
-
-            # Construct the full image URL
             return f"{image_data['domain']}{image_data['base_dir']}/{image_data['dir']}/{image_data['img']}"
-
         except json.JSONDecodeError as e:
             raise Exception(f"Failed to decode JSON: {str(e)}")
 
-    # If the script tag method fails, try to get the image URL from an <img alt> tag
     img_tag = soup.find('img', alt=True)
     if img_tag and 'src' in img_tag.attrs:
         return img_tag['src']
@@ -74,19 +57,8 @@ def extract_image_url(html_content):
 
 def extract_tags(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
-
-    # Find all list items with the relevant class
     tag_elements = soup.find_all('li', class_='tag-type-general')
-
-    tags = []
-    for tag_element in tag_elements:
-        # The second <a> tag contains the relevant tag name
-        tag_links = tag_element.find_all('a')
-        if len(tag_links) > 1:
-            tag_name = tag_links[1].text
-            tags.append(tag_name)
-
-    # Join all tags into a single string separated by commas
+    tags = [tag_element.find_all('a')[1].text for tag_element in tag_elements if len(tag_element.find_all('a')) > 1]
     return ','.join(tags)
 
 def download_image(url, cookies=None):
@@ -98,18 +70,24 @@ def download_image(url, cookies=None):
         raise Exception(f"Failed to download image: {str(e)}")
 
 class DatasetBuilder:
-    def __init__(self):
+    def __init__(self, dataset_name):
+        self.dataset_name = dataset_name
         self.dataset = self.load_dataset()
         os.makedirs(IMAGES_DIR, exist_ok=True)
 
+    def get_dataset_file(self):
+        return os.path.join(DATA_DIR, f"{self.dataset_name}.json")
+
     def load_dataset(self):
-        if os.path.exists(DATASET_FILE):
-            with open(DATASET_FILE, 'r') as f:
+        dataset_file = self.get_dataset_file()
+        if os.path.exists(dataset_file):
+            with open(dataset_file, 'r') as f:
                 return json.load(f)
         return []
 
     def save_dataset(self):
-        with open(DATASET_FILE, 'w') as f:
+        dataset_file = self.get_dataset_file()
+        with open(dataset_file, 'w') as f:
             json.dump(self.dataset, f)
 
     def add_image(self, url, cookies=None):
@@ -125,11 +103,9 @@ class DatasetBuilder:
         tags = extract_tags(html_content)
         image = download_image(image_url, cookies)
 
-        # Generate a unique filename
         filename = f"{uuid.uuid4()}.jpg"
         filepath = os.path.join(IMAGES_DIR, filename)
 
-        # Save the image
         image.save(filepath)
 
         self.dataset.append({
@@ -156,7 +132,7 @@ class DatasetBuilder:
         return f"Error creating HuggingFace Dataset: {str(e)}"
 
     def get_dataset_info(self):
-        return f"Current dataset size: {len(self.dataset)} images"
+        return f"Current dataset size ({self.dataset_name}): {len(self.dataset)} images"
 
     def get_dataset_preview(self, num_images=5):
         preview = []
@@ -165,17 +141,18 @@ class DatasetBuilder:
             preview.append((image_path, item['tags']))
         return preview
 
-dataset_builder = DatasetBuilder()
-
-def add_image_to_dataset(url, cookies):
-    result = dataset_builder.add_image(url, cookies)
-    return result, dataset_builder.get_dataset_info(), dataset_builder.get_dataset_preview()
+def add_image_to_dataset(url, cookies, dataset_name):
+    builder = DatasetBuilder(dataset_name)
+    result = builder.add_image(url, cookies)
+    return result, builder.get_dataset_info(), builder.get_dataset_preview()
 
-def create_huggingface_dataset():
-    return dataset_builder.build_huggingface_dataset()
+def create_huggingface_dataset(dataset_name):
+    builder = DatasetBuilder(dataset_name)
+    return builder.build_huggingface_dataset()
 
-def view_dataset():
-    return dataset_builder.get_dataset_preview(num_images=20)
+def view_dataset(dataset_name):
+    builder = DatasetBuilder(dataset_name)
+    return builder.get_dataset_preview(num_images=20)
 
 # Create Gradio interface
 with gr.Blocks(theme="huggingface") as iface:
@@ -183,25 +160,26 @@ with gr.Blocks(theme="huggingface") as iface:
     gr.Markdown("Enter a URL to add an image and its tags to the dataset. Progress is saved automatically.")
 
     with gr.Row():
+        dataset_name_input = gr.Textbox(lines=1, placeholder="Enter dataset name...", value="default_dataset")
         url_input = gr.Textbox(lines=2, placeholder="Enter image URL here...")
         cookies_input = gr.Textbox(lines=2, placeholder="Enter cookies (optional)")
     add_button = gr.Button("Add Image")
 
     result_output = gr.Textbox(label="Result")
-    dataset_info = gr.Textbox(label="Dataset Info"
+    dataset_info = gr.Textbox(label="Dataset Info")
 
     gr.Markdown("## Dataset Preview")
     preview_gallery = gr.Gallery(label="Recent Additions", show_label=False, elem_id="preview_gallery", columns=5, rows=1, height="auto")
 
-    add_button.click(add_image_to_dataset, inputs=[url_input, cookies_input], outputs=[result_output, dataset_info, preview_gallery])
+    add_button.click(add_image_to_dataset, inputs=[url_input, cookies_input, dataset_name_input], outputs=[result_output, dataset_info, preview_gallery])
 
     create_hf_button = gr.Button("Create HuggingFace Dataset")
     hf_result = gr.Textbox(label="Dataset Creation Result")
-    create_hf_button.click(create_huggingface_dataset, inputs=[], outputs=hf_result)
+    create_hf_button.click(create_huggingface_dataset, inputs=[dataset_name_input], outputs=hf_result)
 
     view_dataset_button = gr.Button("View Dataset")
     dataset_gallery = gr.Gallery(label="Dataset Contents", show_label=False, elem_id="dataset_gallery", columns=5, rows=4, height="auto")
-    view_dataset_button.click(view_dataset, inputs=[], outputs=dataset_gallery)
+    view_dataset_button.click(view_dataset, inputs=[dataset_name_input], outputs=dataset_gallery)
 
 # Launch the interface
-iface.launch()
+iface.launch()
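For reference, a minimal usage sketch of the per-dataset flow this commit introduces is below. It is illustrative only: it assumes DatasetBuilder can be imported from this Space's app.py without launching the Gradio UI, and the dataset name and URL are placeholder values, not anything from the commit.

# Illustrative sketch -- assumes `from app import DatasetBuilder` works without
# side effects; "example_dataset" and the URL below are placeholders.
from app import DatasetBuilder

builder = DatasetBuilder("example_dataset")   # persists to /data/example_dataset.json
print(builder.get_dataset_info())             # e.g. "Current dataset size (example_dataset): 0 images"
msg = builder.add_image("https://example.com/post/12345", cookies=None)
print(msg)
print(builder.get_dataset_info())

Each dataset name now maps to its own JSON file under /data, so several datasets can be built side by side instead of sharing the single dataset.json that the removed DATASET_FILE constant pointed to.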