Wauplin HF staff committed on
Commit
e027770
·
1 Parent(s): b89842d

Add parquet example

Browse files
Files changed (3) hide show
  1. app.py +43 -3
  2. app_parquet.py +239 -0
  3. requirements.txt +3 -1
app.py CHANGED
@@ -1,6 +1,8 @@
1
  # Start by setting token and debug mode before starting schedulers
2
  import os
 
3
  from huggingface_hub import logging, login
 
4
  login(token=os.environ.get("HF_TOKEN"), write_permission=True)
5
  logging.set_verbosity_debug()
6
 
@@ -12,6 +14,8 @@ import gradio as gr
12
  from app_1M_image import get_demo as get_demo_1M_image
13
  from app_image import get_demo as get_demo_image
14
  from app_json import get_demo as get_demo_json
 
 
15
 
16
  def _get_demo_code(path: str) -> str:
17
  code = Path(path).read_text()
@@ -80,7 +84,30 @@ Works with concurrent users and replicas.
80
 
81
  ## Limitations
82
 
83
- None.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  ## Demo
86
  """
@@ -91,14 +118,18 @@ with gr.Blocks() as demo:
91
  with gr.Tab("JSON Dataset"):
92
  gr.Markdown(JSON_DEMO_EXPLANATION)
93
  get_demo_json()
94
- gr.Markdown("## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-json\n\n## Code")
 
 
95
  with gr.Accordion("Source code", open=True):
96
  gr.Code(_get_demo_code("app_json.py"), language="python")
97
 
98
  with gr.Tab("Image Dataset"):
99
  gr.Markdown(IMAGE_DEMO_EXPLANATION)
100
  get_demo_image()
101
- gr.Markdown("## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-image\n\n## Code")
 
 
102
  with gr.Accordion("Source code", open=True):
103
  gr.Code(_get_demo_code("app_image.py"), language="python")
104
 
@@ -110,4 +141,13 @@ with gr.Blocks() as demo:
110
  )
111
  with gr.Accordion("Source code", open=True):
112
  gr.Code(_get_demo_code("app_1M_image.py"), language="python")
 
 
 
 
 
 
 
 
 
113
  demo.launch()
 
1
  # Start by setting token and debug mode before starting schedulers
2
  import os
3
+
4
  from huggingface_hub import logging, login
5
+
6
  login(token=os.environ.get("HF_TOKEN"), write_permission=True)
7
  logging.set_verbosity_debug()
8
 
 
14
  from app_1M_image import get_demo as get_demo_1M_image
15
  from app_image import get_demo as get_demo_image
16
  from app_json import get_demo as get_demo_json
17
+ from app_parquet import get_demo as get_demo_parquet
18
+
19
 
20
  def _get_demo_code(path: str) -> str:
21
  code = Path(path).read_text()
 
84
 
85
  ## Limitations
86
 
87
+ Only 1 image per row. This is fine for most image datasets. However in some cases you might want to save multiple images per row
88
+ (e.g. generate 4 images and select the preferred one). In this case, you must encode how the dataset must be saved, as
89
+ a parquet file. Please have a look at the Parquet example for more details.
90
+
91
+ ## Demo
92
+ """
93
+
94
+ PARQUET_DEMO_EXPLANATION = """
95
+ ## Use case:
96
+
97
+ Save any arbitrary dataset. Each row can contain metadata (text, numbers, datetimes,...) as well as binary data
98
+ (images, audio, video,...). This is particularly useful for datasets with multiple binary files for each row:
99
+
100
+ - Generate multiple images and select preferred one.
101
+ - Take audio as input, generate a translated audio as output.
102
+
103
+ ## Robustness
104
+
105
+ Works with concurrent users and replicas.
106
+
107
+ ## Limitations
108
+
109
+ None. Implementation of the ParquetScheduler requires slightly more work but you get full control over the data that is
110
+ pushed to the Hub.
111
 
112
  ## Demo
113
  """
 
118
  with gr.Tab("JSON Dataset"):
119
  gr.Markdown(JSON_DEMO_EXPLANATION)
120
  get_demo_json()
121
+ gr.Markdown(
122
+ "## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-json\n\n## Code"
123
+ )
124
  with gr.Accordion("Source code", open=True):
125
  gr.Code(_get_demo_code("app_json.py"), language="python")
126
 
127
  with gr.Tab("Image Dataset"):
128
  gr.Markdown(IMAGE_DEMO_EXPLANATION)
129
  get_demo_image()
130
+ gr.Markdown(
131
+ "## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-image\n\n## Code"
132
+ )
133
  with gr.Accordion("Source code", open=True):
134
  gr.Code(_get_demo_code("app_image.py"), language="python")
135
 
 
141
  )
142
  with gr.Accordion("Source code", open=True):
143
  gr.Code(_get_demo_code("app_1M_image.py"), language="python")
144
+
145
+ with gr.Tab("Parquet Dataset"):
146
+ gr.Markdown(PARQUET_DEMO_EXPLANATION)
147
+ get_demo_parquet()
148
+ gr.Markdown(
149
+ "## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-parquet\n\n## Code"
150
+ )
151
+ with gr.Accordion("Source code", open=True):
152
+ gr.Code(_get_demo_code("app_parquet.py"), language="python")
153
  demo.launch()
app_parquet.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import json
3
+ import os
4
+ import shutil
5
+ import tempfile
6
+ import uuid
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List
9
+
10
+ import gradio as gr
11
+ import pyarrow as pa
12
+ import pyarrow.parquet as pq
13
+ from gradio_client import Client
14
+ from huggingface_hub import CommitScheduler
15
+
16
+ #######################
17
+ # Parquet scheduler #
18
+ # Run in scheduler.py #
19
+ #######################
20
+
21
+
22
class ParquetScheduler(CommitScheduler):
    """
    Scheduler that buffers rows in memory and uploads them to the Hub as parquet files.

    Rows are queued with `append`. Each call to `push_to_hub` drains the queue, writes the queued rows to a single
    parquet file and uploads it under a random UUID filename.

    NOTE(review): `self.lock`, `self.api`, `self.repo_id`, `self.repo_type` and `self.revision` are not defined in
    this class; presumably they are provided by `CommitScheduler` -- confirm against the installed version.
    """

    def append(self, row: Dict[str, Any]) -> None:
        """Queue `row` so that it is included in the next push."""
        with self.lock:
            # `rows` is created lazily and reset to None by `push_to_hub` once drained.
            if not hasattr(self, "rows") or self.rows is None:
                self.rows = []
            self.rows.append(row)

    def set_schema(self, schema: Dict[str, Dict[str, str]]) -> None:
        """
        Define a schema to help `datasets` load the generated dataset.

        This method is optional and can be called once just after the scheduler has been created. If it is not called,
        the schema is automatically inferred before pushing the data to the Hub.

        See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value for the list of
        possible values.

        Example:
        ```py
        scheduler.set_schema({
            "prompt": {"_type": "Value", "dtype": "string"},
            "negative_prompt": {"_type": "Value", "dtype": "string"},
            "guidance_scale": {"_type": "Value", "dtype": "int64"},
            "image": {"_type": "Image"},
        })
        ```
        """
        self._schema = schema

    def push_to_hub(self):
        """
        Upload all queued rows as a single parquet file.

        Does nothing when no row is queued. Values of "Image"/"Audio" columns that point to an existing local file
        are replaced by the file content; those files are deleted once the upload has succeeded.
        """
        # Check for new rows to push
        with self.lock:
            rows = getattr(self, "rows", None)
            self.rows = None
        if not rows:
            return
        print(f"Got {len(rows)} item(s) to commit.")

        # Load images + create 'features' config for datasets library
        hf_features: Dict[str, Dict] = getattr(self, "_schema", None) or {}
        path_to_cleanup: List[Path] = []
        for row in rows:
            for key, value in row.items():
                # Infer schema (for `datasets` library)
                if key not in hf_features:
                    hf_features[key] = _infer_schema(key, value)

                # Load binary files if necessary.
                # Only str/Path values can designate a file: `Path(None)` or `Path(<bytes>)` would raise a TypeError,
                # so any other value is left untouched.
                if hf_features[key]["_type"] in ("Image", "Audio") and isinstance(value, (str, Path)):
                    # It's an image or audio: we load the bytes and remember to cleanup the file
                    file_path = Path(value)
                    if file_path.is_file():
                        row[key] = {
                            "path": file_path.name,
                            "bytes": file_path.read_bytes(),
                        }
                        path_to_cleanup.append(file_path)

        # Complete rows if needed
        for row in rows:
            for feature in hf_features:
                if feature not in row:
                    row[feature] = None

        # Export items to Arrow format
        table = pa.Table.from_pylist(rows)

        # Add metadata (used by datasets library)
        table = table.replace_schema_metadata(
            {"huggingface": json.dumps({"info": {"features": hf_features}})}
        )

        # Write to parquet file and upload it. The context manager guarantees that the temporary file is closed
        # (and therefore removed) even if writing or uploading fails.
        with tempfile.NamedTemporaryFile() as archive_file:
            pq.write_table(table, archive_file.name)
            self.api.upload_file(
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                revision=self.revision,
                path_in_repo=f"{uuid.uuid4()}.parquet",
                path_or_fileobj=archive_file.name,
            )
        print("Commit completed.")

        # Cleanup: local binary files are removed only once they have been uploaded
        for path in path_to_cleanup:
            path.unlink(missing_ok=True)
112
+
113
+
114
+ def _infer_schema(key: str, value: Any) -> Dict[str, str]:
115
+ """
116
+ Infer schema for the `datasets` library.
117
+
118
+ See https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Value.
119
+ """
120
+ if "image" in key:
121
+ return {"_type": "Image"}
122
+ if "audio" in key:
123
+ return {"_type": "Audio"}
124
+ if isinstance(value, int):
125
+ return {"_type": "Value", "dtype": "int64"}
126
+ if isinstance(value, float):
127
+ return {"_type": "Value", "dtype": "float64"}
128
+ if isinstance(value, bool):
129
+ return {"_type": "Value", "dtype": "bool"}
130
+ if isinstance(value, bytes):
131
+ return {"_type": "Value", "dtype": "binary"}
132
+ # Otherwise in last resort => convert it to a string
133
+ return {"_type": "Value", "dtype": "string"}
134
+
135
+
136
+ #################
137
+ # Gradio app #
138
+ # Run in app.py #
139
+ #################
140
+
141
# Local working folder; created at import time if it does not exist yet.
PARQUET_DATASET_DIR = Path("parquet_dataset")
PARQUET_DATASET_DIR.mkdir(exist_ok=True, parents=True)

# Scheduler in charge of pushing the collected rows to the dataset repo.
scheduler = ParquetScheduler(
    folder_path=PARQUET_DATASET_DIR,
    path_in_repo="data",
    repo_type="dataset",
    repo_id="example-space-to-dataset-parquet",
)

# Client of the remote Space used to generate the images.
client = Client("stabilityai/stable-diffusion")
152
+
153
+
154
def generate(prompt: str) -> tuple[str, list[str]]:
    """
    Generate images for `prompt` (callback of the prompt 'submit' event).

    Returns a pair made of the path to a JSON file describing the generation settings, and the list of keys found
    in the `captions.json` file of the output directory returned by the remote Space.
    """
    negative_prompt = ""
    guidance_scale = 9

    # Generate from https://huggingface.co/spaces/stabilityai/stable-diffusion
    output_dir = client.predict(prompt, negative_prompt, guidance_scale, fn_index=1)
    captions_file = Path(output_dir) / "captions.json"
    with captions_file.open() as stream:
        captions = json.load(stream)
        image_paths = list(captions.keys())

    # Save config used to generate data.
    # `delete=False`: the file is read back later, once the user saves a preference.
    settings = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "guidance_scale": guidance_scale,
    }
    settings_file = tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False)
    with settings_file:
        json.dump(settings, settings_file)

    return settings_file.name, image_paths
170
+
171
+
172
def get_selected_index(evt: gr.SelectData) -> int:
    """Return the index carried by the selection event, i.e. the image picked as "best"."""
    chosen_index = evt.index
    return chosen_index
175
+
176
+
177
def save_preference(
    config_path: str, gallery: list[dict[str, Any]], selected_index: int
) -> None:
    """
    Save the user preference.

    The gallery images are moved to a fresh sub-folder of `PARQUET_DATASET_DIR`. The generation config is then
    completed with the selected index, a timestamp and the new image paths, and sent to the scheduler.
    """
    target_dir = PARQUET_DATASET_DIR / str(uuid.uuid4())
    target_dir.mkdir(parents=True, exist_ok=True)

    # Start from the config that was used to generate the images
    with open(config_path) as stream:
        record = json.load(stream)

    # Selected item + timestamp
    record["selected_index"] = selected_index
    record["timestamp"] = datetime.datetime.utcnow().isoformat()

    # Move each image next to its siblings, under a zero-padded name, and reference it in the row
    for position, item in enumerate(gallery):
        source = item["name"]
        label = f"{position:03d}"
        destination = target_dir / f"{label}{Path(source).suffix}"
        shutil.move(source, destination)
        record[f"image_{label}"] = destination

    # Hand the row over to the scheduler
    scheduler.append(record)
201
+
202
+
203
def clear() -> tuple[dict, dict, dict]:
    """Build the three updates applied once a preference is saved: two value resets, then a disabled state."""
    first_reset = gr.update(value=None)
    second_reset = gr.update(value=None)
    disabled = gr.update(interactive=False)
    return (first_reset, second_reset, disabled)
206
+
207
+
208
def get_demo():
    """
    Build the components of the Parquet demo and wire their events.

    Flow: submitting the prompt generates images and enables the save button; selecting an image stores its index;
    clicking the save button saves the preference and then clears the form.
    """
    with gr.Group():
        prompt = gr.Text(show_label=False, placeholder="Prompt")
        # Hidden field carrying the path of the generation config between callbacks
        config_path = gr.Text(visible=False)
        gallery = gr.Gallery(show_label=False).style(
            columns=2, rows=2, height="600px", object_fit="scale-down"
        )
        # Hidden field carrying the index of the image selected in the gallery
        selected_index = gr.Number(visible=False, precision=0)
        save_preference_button = gr.Button("Save preference", interactive=False)

    # Generate images on submit, then enable the save button if generation succeeded
    generation_event = prompt.submit(
        fn=generate,
        inputs=prompt,
        outputs=[config_path, gallery],
    )
    generation_event.success(
        fn=lambda: gr.update(interactive=True),
        outputs=save_preference_button,
        queue=False,
    )

    # Remember which image has been selected
    gallery.select(
        fn=get_selected_index,
        outputs=selected_index,
        queue=False,
    )

    # Save preference on click, then reset the form
    save_event = save_preference_button.click(
        fn=save_preference,
        inputs=[config_path, gallery, selected_index],
        queue=False,
    )
    save_event.then(
        fn=clear,
        outputs=[config_path, gallery, save_preference_button],
        queue=False,
    )
requirements.txt CHANGED
@@ -1 +1,3 @@
1
- git+https://github.com/huggingface/huggingface_hub
 
 
 
1
+ git+https://github.com/huggingface/huggingface_hub
2
+ gradio_client==0.2.6
3
+ pyarrow==12.0.1