radames committed on
Commit
edcf6dc
·
1 Parent(s): 5c6a629

use newest diffusers

Browse files
app-img2img.py CHANGED
@@ -9,9 +9,10 @@ from fastapi.middleware.cors import CORSMiddleware
9
  from fastapi.responses import StreamingResponse, JSONResponse
10
  from fastapi.staticfiles import StaticFiles
11
 
12
- from diffusers import DiffusionPipeline, AutoencoderTiny
13
  from compel import Compel
14
  import torch
 
15
  try:
16
  import intel_extension_for_pytorch as ipex
17
  except:
@@ -31,12 +32,14 @@ SAFETY_CHECKER = os.environ.get("SAFETY_CHECKER", None)
31
  WIDTH = 512
32
  HEIGHT = 512
33
  # disable tiny autoencoder for better quality speed tradeoff
34
- USE_TINY_AUTOENCODER=True
35
 
36
  # check if MPS is available OSX only M1/M2/M3 chips
37
  mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
38
- xpu_available = hasattr(torch, 'xpu') and torch.xpu.is_available()
39
- device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if xpu_available else "cpu")
 
 
40
  torch_device = device
41
 
42
  # change to torch.float16 to save GPU memory
@@ -53,17 +56,13 @@ if mps_available:
53
  torch_dtype = torch.float32
54
 
55
  if SAFETY_CHECKER == "True":
56
- pipe = DiffusionPipeline.from_pretrained(
57
  "SimianLuo/LCM_Dreamshaper_v7",
58
- custom_pipeline="latent_consistency_img2img.py",
59
- custom_revision="main",
60
  )
61
  else:
62
- pipe = DiffusionPipeline.from_pretrained(
63
  "SimianLuo/LCM_Dreamshaper_v7",
64
  safety_checker=None,
65
- custom_pipeline="latent_consistency_img2img.py",
66
- custom_revision="main",
67
  )
68
 
69
  if USE_TINY_AUTOENCODER:
@@ -71,7 +70,7 @@ if USE_TINY_AUTOENCODER:
71
  "madebyollin/taesd", torch_dtype=torch_dtype, use_safetensors=True
72
  )
73
  pipe.set_progress_bar_config(disable=True)
74
- pipe.to(torch_device=torch_device, torch_dtype=torch_dtype).to(device)
75
  pipe.unet.to(memory_format=torch.channels_last)
76
 
77
  if psutil.virtual_memory().total < 64 * 1024**3:
@@ -98,7 +97,9 @@ class InputParams(BaseModel):
98
  height: int = HEIGHT
99
 
100
 
101
- def predict(input_image: Image.Image, params: InputParams, prompt_embeds: torch.Tensor = None):
 
 
102
  generator = torch.manual_seed(params.seed)
103
  # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps.
104
  num_inference_steps = 3
@@ -111,7 +112,7 @@ def predict(input_image: Image.Image, params: InputParams, prompt_embeds: torch.
111
  guidance_scale=params.guidance_scale,
112
  width=params.width,
113
  height=params.height,
114
- lcm_origin_steps=50,
115
  output_type="pil",
116
  )
117
  nsfw_content_detected = (
@@ -181,6 +182,7 @@ async def stream(user_id: uuid.UUID):
181
  try:
182
  user_queue = user_queue_map[uid]
183
  queue = user_queue["queue"]
 
184
  async def generate():
185
  last_prompt: str = None
186
  prompt_embeds: torch.Tensor = None
 
9
  from fastapi.responses import StreamingResponse, JSONResponse
10
  from fastapi.staticfiles import StaticFiles
11
 
12
+ from diffusers import AutoPipelineForImage2Image, AutoencoderTiny
13
  from compel import Compel
14
  import torch
15
+
16
  try:
17
  import intel_extension_for_pytorch as ipex
18
  except:
 
32
  WIDTH = 512
33
  HEIGHT = 512
34
  # disable tiny autoencoder for better quality speed tradeoff
35
+ USE_TINY_AUTOENCODER = True
36
 
37
  # check if MPS is available OSX only M1/M2/M3 chips
38
  mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
39
+ xpu_available = hasattr(torch, "xpu") and torch.xpu.is_available()
40
+ device = torch.device(
41
+ "cuda" if torch.cuda.is_available() else "xpu" if xpu_available else "cpu"
42
+ )
43
  torch_device = device
44
 
45
  # change to torch.float16 to save GPU memory
 
56
  torch_dtype = torch.float32
57
 
58
  if SAFETY_CHECKER == "True":
59
+ pipe = AutoPipelineForImage2Image.from_pretrained(
60
  "SimianLuo/LCM_Dreamshaper_v7",
 
 
61
  )
62
  else:
63
+ pipe = AutoPipelineForImage2Image.from_pretrained(
64
  "SimianLuo/LCM_Dreamshaper_v7",
65
  safety_checker=None,
 
 
66
  )
67
 
68
  if USE_TINY_AUTOENCODER:
 
70
  "madebyollin/taesd", torch_dtype=torch_dtype, use_safetensors=True
71
  )
72
  pipe.set_progress_bar_config(disable=True)
73
+ pipe.to(device=torch_device, dtype=torch_dtype).to(device)
74
  pipe.unet.to(memory_format=torch.channels_last)
75
 
76
  if psutil.virtual_memory().total < 64 * 1024**3:
 
97
  height: int = HEIGHT
98
 
99
 
100
+ def predict(
101
+ input_image: Image.Image, params: InputParams, prompt_embeds: torch.Tensor = None
102
+ ):
103
  generator = torch.manual_seed(params.seed)
104
  # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps.
105
  num_inference_steps = 3
 
112
  guidance_scale=params.guidance_scale,
113
  width=params.width,
114
  height=params.height,
115
+ original_inference_steps=50,
116
  output_type="pil",
117
  )
118
  nsfw_content_detected = (
 
182
  try:
183
  user_queue = user_queue_map[uid]
184
  queue = user_queue["queue"]
185
+
186
  async def generate():
187
  last_prompt: str = None
188
  prompt_embeds: torch.Tensor = None
app-txt2img.py CHANGED
@@ -12,6 +12,7 @@ from fastapi.staticfiles import StaticFiles
12
  from diffusers import DiffusionPipeline, AutoencoderTiny
13
  from compel import Compel
14
  import torch
 
15
  try:
16
  import intel_extension_for_pytorch as ipex
17
  except:
@@ -29,15 +30,17 @@ import psutil
29
  MAX_QUEUE_SIZE = int(os.environ.get("MAX_QUEUE_SIZE", 0))
30
  TIMEOUT = float(os.environ.get("TIMEOUT", 0))
31
  SAFETY_CHECKER = os.environ.get("SAFETY_CHECKER", None)
32
- WIDTH = 512
33
- HEIGHT = 512
34
  # disable tiny autoencoder for better quality speed tradeoff
35
- USE_TINY_AUTOENCODER=True
36
 
37
  # check if MPS is available OSX only M1/M2/M3 chips
38
  mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
39
- xpu_available = hasattr(torch, 'xpu') and torch.xpu.is_available()
40
- device = torch.device("cuda" if torch.cuda.is_available() else "xpu" if xpu_available else "cpu")
 
 
41
  torch_device = device
42
  # change to torch.float16 to save GPU memory
43
  torch_dtype = torch.float32
@@ -55,22 +58,18 @@ if mps_available:
55
  if SAFETY_CHECKER == "True":
56
  pipe = DiffusionPipeline.from_pretrained(
57
  "SimianLuo/LCM_Dreamshaper_v7",
58
- custom_pipeline="latent_consistency_txt2img.py",
59
- custom_revision="main",
60
  )
61
  else:
62
  pipe = DiffusionPipeline.from_pretrained(
63
  "SimianLuo/LCM_Dreamshaper_v7",
64
  safety_checker=None,
65
- custom_pipeline="latent_consistency_txt2img.py",
66
- custom_revision="main",
67
  )
68
  if USE_TINY_AUTOENCODER:
69
  pipe.vae = AutoencoderTiny.from_pretrained(
70
  "madebyollin/taesd", torch_dtype=torch_dtype, use_safetensors=True
71
  )
72
  pipe.set_progress_bar_config(disable=True)
73
- pipe.to(torch_device=torch_device, torch_dtype=torch_dtype).to(device)
74
  pipe.unet.to(memory_format=torch.channels_last)
75
 
76
  # check if computer has less than 64GB of RAM using sys or os
@@ -88,6 +87,7 @@ compel_proc = Compel(
88
  )
89
  user_queue_map = {}
90
 
 
91
  class InputParams(BaseModel):
92
  prompt: str
93
  seed: int = 2159232
@@ -95,6 +95,7 @@ class InputParams(BaseModel):
95
  width: int = WIDTH
96
  height: int = HEIGHT
97
 
 
98
  def predict(params: InputParams):
99
  generator = torch.manual_seed(params.seed)
100
  prompt_embeds = compel_proc(params.prompt)
@@ -107,7 +108,7 @@ def predict(params: InputParams):
107
  guidance_scale=params.guidance_scale,
108
  width=params.width,
109
  height=params.height,
110
- lcm_origin_steps=50,
111
  output_type="pil",
112
  )
113
  nsfw_content_detected = (
@@ -129,6 +130,7 @@ app.add_middleware(
129
  allow_headers=["*"],
130
  )
131
 
 
132
  @app.websocket("/ws")
133
  async def websocket_endpoint(websocket: WebSocket):
134
  await websocket.accept()
 
12
  from diffusers import DiffusionPipeline, AutoencoderTiny
13
  from compel import Compel
14
  import torch
15
+
16
  try:
17
  import intel_extension_for_pytorch as ipex
18
  except:
 
30
  MAX_QUEUE_SIZE = int(os.environ.get("MAX_QUEUE_SIZE", 0))
31
  TIMEOUT = float(os.environ.get("TIMEOUT", 0))
32
  SAFETY_CHECKER = os.environ.get("SAFETY_CHECKER", None)
33
+ WIDTH = 768
34
+ HEIGHT = 768
35
  # disable tiny autoencoder for better quality speed tradeoff
36
+ USE_TINY_AUTOENCODER = False
37
 
38
  # check if MPS is available OSX only M1/M2/M3 chips
39
  mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
40
+ xpu_available = hasattr(torch, "xpu") and torch.xpu.is_available()
41
+ device = torch.device(
42
+ "cuda" if torch.cuda.is_available() else "xpu" if xpu_available else "cpu"
43
+ )
44
  torch_device = device
45
  # change to torch.float16 to save GPU memory
46
  torch_dtype = torch.float32
 
58
  if SAFETY_CHECKER == "True":
59
  pipe = DiffusionPipeline.from_pretrained(
60
  "SimianLuo/LCM_Dreamshaper_v7",
 
 
61
  )
62
  else:
63
  pipe = DiffusionPipeline.from_pretrained(
64
  "SimianLuo/LCM_Dreamshaper_v7",
65
  safety_checker=None,
 
 
66
  )
67
  if USE_TINY_AUTOENCODER:
68
  pipe.vae = AutoencoderTiny.from_pretrained(
69
  "madebyollin/taesd", torch_dtype=torch_dtype, use_safetensors=True
70
  )
71
  pipe.set_progress_bar_config(disable=True)
72
+ pipe.to(device=torch_device, dtype=torch_dtype).to(device)
73
  pipe.unet.to(memory_format=torch.channels_last)
74
 
75
  # check if computer has less than 64GB of RAM using sys or os
 
87
  )
88
  user_queue_map = {}
89
 
90
+
91
  class InputParams(BaseModel):
92
  prompt: str
93
  seed: int = 2159232
 
95
  width: int = WIDTH
96
  height: int = HEIGHT
97
 
98
+
99
  def predict(params: InputParams):
100
  generator = torch.manual_seed(params.seed)
101
  prompt_embeds = compel_proc(params.prompt)
 
108
  guidance_scale=params.guidance_scale,
109
  width=params.width,
110
  height=params.height,
111
+ original_inference_steps=50,
112
  output_type="pil",
113
  )
114
  nsfw_content_detected = (
 
130
  allow_headers=["*"],
131
  )
132
 
133
+
134
  @app.websocket("/ws")
135
  async def websocket_endpoint(websocket: WebSocket):
136
  await websocket.accept()
img2img/index.html CHANGED
@@ -257,7 +257,7 @@
257
  <output class="text-xs w-[50px] text-center font-light px-1 py-1 border border-gray-700 rounded-md">
258
  8.0</output>
259
  <label class="text-sm font-medium" for="strength">Strength</label>
260
- <input type="range" id="strength" name="strength" min="0.20" max="1" step="0.001" value="0.50"
261
  oninput="this.nextElementSibling.value = Number(this.value).toFixed(2)">
262
  <output class="text-xs w-[50px] text-center font-light px-1 py-1 border border-gray-700 rounded-md">
263
  0.5</output>
 
257
  <output class="text-xs w-[50px] text-center font-light px-1 py-1 border border-gray-700 rounded-md">
258
  8.0</output>
259
  <label class="text-sm font-medium" for="strength">Strength</label>
260
+ <input type="range" id="strength" name="strength" min="0.02" max="1" step="0.001" value="0.50"
261
  oninput="this.nextElementSibling.value = Number(this.value).toFixed(2)">
262
  <output class="text-xs w-[50px] text-center font-light px-1 py-1 border border-gray-700 rounded-md">
263
  0.5</output>
latent_consistency_img2img.py DELETED
@@ -1,934 +0,0 @@
1
- # Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- # DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
16
- # and https://github.com/hojonathanho/diffusion
17
-
18
- import math
19
- from dataclasses import dataclass
20
- from typing import Any, Dict, List, Optional, Tuple, Union
21
-
22
- import numpy as np
23
- import PIL.Image
24
- import torch
25
- from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
26
-
27
- from diffusers import (
28
- AutoencoderTiny,
29
- AutoencoderKL,
30
- ConfigMixin,
31
- DiffusionPipeline,
32
- SchedulerMixin,
33
- UNet2DConditionModel,
34
- logging,
35
- )
36
- from diffusers.configuration_utils import register_to_config
37
- from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
38
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
39
- from diffusers.pipelines.stable_diffusion.safety_checker import (
40
- StableDiffusionSafetyChecker,
41
- )
42
- from diffusers.utils import BaseOutput
43
- from diffusers.utils.torch_utils import randn_tensor
44
-
45
-
46
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
47
-
48
-
class LatentConsistencyModelImg2ImgPipeline(DiffusionPipeline):
    """Image-to-image pipeline that samples with a Latent Consistency Model.

    The input image is encoded with the VAE, noised to the first timestep of
    the scheduler and then refined by the multistep sampling loop in
    `__call__`. Only the conditional prompt embedding is used; the guidance
    scale is passed to the UNet as a `timestep_cond` embedding.
    """

    _optional_components = ["scheduler"]

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: "LCMSchedulerWithTimestamp",
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        """Register the sub-models; build a default scheduler when `scheduler` is None.

        `requires_safety_checker` is accepted for signature compatibility and is
        not read by this constructor.
        """
        super().__init__()

        if scheduler is None:
            scheduler = LCMSchedulerWithTimestamp(
                beta_start=0.00085,
                beta_end=0.0120,
                beta_schedule="scaled_linear",
                prediction_type="epsilon",
            )

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    def _encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        prompt_embeds: Optional[torch.FloatTensor] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
        Returns:
            The prompt embeddings, repeated `num_images_per_prompt` times along the batch dimension.
        Raises:
            ValueError: if `prompt_embeds` is None and `prompt` is neither a `str` nor a `list`.
        """
        if prompt_embeds is None:
            if not isinstance(prompt, (str, list)):
                raise ValueError(
                    "`prompt` must be a `str` or a list of `str` when `prompt_embeds` is not provided."
                )

            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(
                prompt, padding="longest", return_tensors="pt"
            ).input_ids

            # Warn when the tokenizer had to drop part of the prompt.
            was_truncated = untruncated_ids.shape[-1] >= text_input_ids.shape[
                -1
            ] and not torch.equal(text_input_ids, untruncated_ids)
            if was_truncated:
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            if (
                hasattr(self.text_encoder.config, "use_attention_mask")
                and self.text_encoder.config.use_attention_mask
            ):
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            prompt_embeds = self.text_encoder(
                text_input_ids.to(device),
                attention_mask=attention_mask,
            )
            prompt_embeds = prompt_embeds[0]

        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(
            bs_embed * num_images_per_prompt, seq_len, -1
        )

        # Don't need to get uncond prompt embedding because of LCM Guided Distillation
        return prompt_embeds

    def run_safety_checker(self, image, device, dtype):
        """Run the safety checker on `image` if one is registered.

        Returns:
            `(image, has_nsfw_concept)`; `has_nsfw_concept` is None when the
            pipeline has no safety checker.
        """
        if self.safety_checker is None:
            return image, None

        if torch.is_tensor(image):
            feature_extractor_input = self.image_processor.postprocess(
                image, output_type="pil"
            )
        else:
            feature_extractor_input = self.image_processor.numpy_to_pil(image)
        safety_checker_input = self.feature_extractor(
            feature_extractor_input, return_tensors="pt"
        ).to(device)
        image, has_nsfw_concept = self.safety_checker(
            images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
        )
        return image, has_nsfw_concept

    def prepare_latents(
        self,
        image,
        timestep,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        latents=None,
        generator=None,
    ):
        """Encode `image` to latents and noise them to `timestep`.

        The latent shape is taken from the encoded image, so
        `num_channels_latents`, `height` and `width` do not influence the
        result. `latents` is likewise not used: the returned latents are always
        derived from `image`. These parameters are kept for signature
        compatibility.

        Args:
            image: a tensor, a PIL image or a list. Non-tensor inputs are
                converted with `self.image_processor.preprocess`. A tensor
                with 4 channels is treated as already-encoded latents.
            timestep: timestep(s) passed to `scheduler.add_noise`.
            batch_size: effective batch size (prompts x images per prompt).
            dtype / device: target dtype and device of the latents.
            generator: a `torch.Generator` or a list with one per batch item.
        Raises:
            ValueError: on an unsupported `image` type, a generator list whose
                length differs from `batch_size`, or an image batch that cannot
                be evenly duplicated to `batch_size`.
        """
        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
            )

        # PIL images and lists pass the type check above but have no `.to()`;
        # turn them into a tensor first.
        if not isinstance(image, torch.Tensor):
            image = self.image_processor.preprocess(image)

        image = image.to(device=device, dtype=dtype)

        if image.shape[1] == 4:
            init_latents = image
        else:
            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )

            # AutoencoderTiny exposes `.latents` directly; other VAEs return a
            # distribution that has to be sampled.
            uses_tiny_vae = isinstance(self.vae, AutoencoderTiny)
            if isinstance(generator, list):
                if uses_tiny_vae:
                    per_item = [
                        self.vae.encode(image[i : i + 1]).latents
                        for i in range(batch_size)
                    ]
                else:
                    per_item = [
                        self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i])
                        for i in range(batch_size)
                    ]
                init_latents = torch.cat(per_item, dim=0)
            elif uses_tiny_vae:
                init_latents = self.vae.encode(image).latents
            else:
                init_latents = self.vae.encode(image).latent_dist.sample(generator)

            init_latents = self.vae.config.scaling_factor * init_latents

        num_encoded = init_latents.shape[0]
        if batch_size > num_encoded and batch_size % num_encoded == 0:
            # expand init_latents for batch_size
            logger.warning(
                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
                " images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
                " your script to pass as many initial images as text prompts to suppress this warning."
            )
            additional_image_per_prompt = batch_size // num_encoded
            init_latents = torch.cat(
                [init_latents] * additional_image_per_prompt, dim=0
            )
        elif batch_size > num_encoded and batch_size % num_encoded != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        else:
            init_latents = torch.cat([init_latents], dim=0)

        noise = randn_tensor(
            init_latents.shape, generator=generator, device=device, dtype=dtype
        )

        # get latents
        latents = self.scheduler.add_noise(init_latents, noise, timestep).to(device)
        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
        """
        see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
        Args:
            w: torch.Tensor: 1-D tensor of guidance scales to embed
            embedding_dim: int: dimension of the embeddings to generate
            dtype: data type of the generated embeddings
        Returns:
            embedding vectors with shape `(len(w), embedding_dim)`
        """
        assert len(w.shape) == 1
        w = w * 1000.0

        half_dim = embedding_dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
        emb = w.to(dtype)[:, None] * emb[None, :]
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
        if embedding_dim % 2 == 1:  # zero pad
            emb = torch.nn.functional.pad(emb, (0, 1))
        assert emb.shape == (w.shape[0], embedding_dim)
        return emb

    def get_timesteps(self, num_inference_steps, strength, device):
        """Return the tail of the scheduler timesteps selected by `strength`.

        Returns:
            `(timesteps, remaining_steps)`. `device` is not used. `__call__`
            does not use this helper; it reads `scheduler.timesteps` directly.
        """
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]

        return timesteps, num_inference_steps - t_start

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: PipelineImageInput = None,
        strength: float = 0.8,
        height: Optional[int] = 768,
        width: Optional[int] = 768,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        latents: Optional[torch.FloatTensor] = None,
        generator: Optional[torch.Generator] = None,
        num_inference_steps: int = 4,
        lcm_origin_steps: int = 50,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Generate images from `image`, guided by `prompt` / `prompt_embeds`.

        Args:
            prompt: a prompt or list of prompts; may be omitted when
                `prompt_embeds` is given.
            image: the init image, preprocessed with `self.image_processor`.
            strength, num_inference_steps, lcm_origin_steps: forwarded to
                `scheduler.set_timesteps`.
            height, width: forwarded to `prepare_latents`, which derives the
                latent size from the encoded image instead.
            guidance_scale: embedded and passed to the UNet as `timestep_cond`.
            num_images_per_prompt: number of images per prompt.
            latents: forwarded to `prepare_latents`, which does not use it.
            generator: random generator(s) for VAE sampling and noise.
            prompt_embeds: pre-computed text embeddings.
            output_type: `"latent"` skips VAE decoding and the safety checker.
            return_dict: return a `StableDiffusionPipelineOutput` when True,
                otherwise an `(images, nsfw_content_detected)` tuple.
            cross_attention_kwargs: forwarded to the UNet.
        Raises:
            ValueError: if `prompt` is neither a `str` nor a `list` and
                `prompt_embeds` is not provided.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 2. Define call parameters
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        elif prompt_embeds is not None:
            batch_size = prompt_embeds.shape[0]
        else:
            raise ValueError(
                "`prompt` must be a `str` or a list of `str` when `prompt_embeds` is not provided."
            )

        device = self._execution_device
        # do_classifier_free_guidance = guidance_scale > 0.0  # In LCM Implementation:  cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            prompt_embeds=prompt_embeds,
        )

        # 3.5 encode image
        image = self.image_processor.preprocess(image)

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(strength, num_inference_steps, lcm_origin_steps)
        timesteps = self.scheduler.timesteps
        bs = batch_size * num_images_per_prompt
        latent_timestep = timesteps[:1].repeat(bs)

        logger.debug("timesteps: %s", timesteps)

        # 5. Prepare latent variable
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            image,
            latent_timestep,
            bs,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            latents,
            generator,
        )

        # 6. Get Guidance Scale Embedding
        w = torch.tensor(guidance_scale).repeat(bs)
        w_embedding = self.get_w_embedding(w, embedding_dim=256).to(
            device=device, dtype=latents.dtype
        )

        # Fallback so `denoised` is defined even if the schedule is empty.
        denoised = latents

        # 7. LCM MultiStep Sampling Loop:
        with self.progress_bar(total=len(timesteps)) as progress_bar:
            for i, t in enumerate(timesteps):
                ts = torch.full((bs,), t, device=device, dtype=torch.long)
                latents = latents.to(prompt_embeds.dtype)

                # model prediction (v-prediction, eps, x)
                model_pred = self.unet(
                    latents,
                    ts,
                    timestep_cond=w_embedding,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                )[0]

                # compute the previous noisy sample x_t -> x_t-1
                latents, denoised = self.scheduler.step(
                    model_pred, i, t, latents, return_dict=False
                )

                progress_bar.update()

        denoised = denoised.to(prompt_embeds.dtype)
        if output_type != "latent":
            image = self.vae.decode(
                denoised / self.vae.config.scaling_factor, return_dict=False
            )[0]
            image, has_nsfw_concept = self.run_safety_checker(
                image, device, prompt_embeds.dtype
            )
        else:
            image = denoised
            has_nsfw_concept = None

        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(
            image, output_type=output_type, do_denormalize=do_denormalize
        )

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(
            images=image, nsfw_content_detected=has_nsfw_concept
        )
450
-
451
-
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
@dataclass
class LCMSchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.
    Args:
        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
        denoised (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images, *optional*):
            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
            `denoised` can be used to preview progress or for guidance.
    """

    prev_sample: torch.FloatTensor
    denoised: Optional[torch.FloatTensor] = None
468
-
469
-
# Adapted from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
    num_diffusion_timesteps,
    max_beta=0.999,
    alpha_transform_type="cosine",
):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
    (1-beta) over time from t = [0,1].
    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
    to that part of the diffusion process.
    Args:
        num_diffusion_timesteps (`int`): the number of betas to produce.
        max_beta (`float`): the maximum beta to use; use values lower than 1 to
            prevent singularities.
        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
            Choose from `cosine` or `exp`
    Returns:
        betas (`torch.Tensor`): float32 tensor of length `num_diffusion_timesteps` with the betas used by the
        scheduler to step the model outputs
    Raises:
        ValueError: if `alpha_transform_type` is neither `cosine` nor `exp`.
    """
    if alpha_transform_type == "cosine":

        def alpha_bar_fn(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    elif alpha_transform_type == "exp":

        def alpha_bar_fn(t):
            return math.exp(t * -12.0)

    else:
        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    # beta_i = 1 - alpha_bar(t_{i+1}) / alpha_bar(t_i), capped at max_beta.
    betas = [
        min(
            1
            - alpha_bar_fn((i + 1) / num_diffusion_timesteps)
            / alpha_bar_fn(i / num_diffusion_timesteps),
            max_beta,
        )
        for i in range(num_diffusion_timesteps)
    ]
    return torch.tensor(betas, dtype=torch.float32)
509
-
510
-
def rescale_zero_terminal_snr(betas):
    """
    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)
    Args:
        betas (`torch.FloatTensor`):
            the betas that the scheduler is being initialized with. The input tensor is not modified.
    Returns:
        `torch.FloatTensor`: rescaled betas with zero terminal SNR
    Note:
        With a single beta the first and last sqrt(alpha_bar) coincide, so the rescale factor divides by zero.
    """
    # Convert betas to alphas_bar_sqrt
    alphas = 1.0 - betas
    alphas_cumprod = torch.cumprod(alphas, dim=0)
    alphas_bar_sqrt = alphas_cumprod.sqrt()

    # Store old values.
    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()

    # Shift so the last timestep is zero.
    alphas_bar_sqrt -= alphas_bar_sqrt_T

    # Scale so the first timestep is back to the old value.
    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)

    # Convert alphas_bar_sqrt to betas
    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
    alphas = alphas_bar[1:] / alphas_bar[:-1]  # Revert cumprod
    alphas = torch.cat([alphas_bar[0:1], alphas])
    betas = 1 - alphas

    return betas
542
-
543
-
544
- class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
545
- """
546
- This class modifies LCMScheduler to add a timestamp argument to set_timesteps
547
-
548
-
549
- `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
550
- non-Markovian guidance.
551
- This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
552
- methods the library implements for all schedulers such as loading and saving.
553
- Args:
554
- num_train_timesteps (`int`, defaults to 1000):
555
- The number of diffusion steps to train the model.
556
- beta_start (`float`, defaults to 0.0001):
557
- The starting `beta` value of inference.
558
- beta_end (`float`, defaults to 0.02):
559
- The final `beta` value.
560
- beta_schedule (`str`, defaults to `"linear"`):
561
- The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
562
- `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
563
- trained_betas (`np.ndarray`, *optional*):
564
- Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
565
- clip_sample (`bool`, defaults to `True`):
566
- Clip the predicted sample for numerical stability.
567
- clip_sample_range (`float`, defaults to 1.0):
568
- The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
569
- set_alpha_to_one (`bool`, defaults to `True`):
570
- Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
571
- there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
572
- otherwise it uses the alpha value at step 0.
573
- steps_offset (`int`, defaults to 0):
574
- An offset added to the inference steps. You can use a combination of `offset=1` and
575
- `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
576
- Diffusion.
577
- prediction_type (`str`, defaults to `epsilon`, *optional*):
578
- Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
579
- `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
580
- Video](https://imagen.research.google/video/paper.pdf) paper).
581
- thresholding (`bool`, defaults to `False`):
582
- Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
583
- as Stable Diffusion.
584
- dynamic_thresholding_ratio (`float`, defaults to 0.995):
585
- The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
586
- sample_max_value (`float`, defaults to 1.0):
587
- The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
588
- timestep_spacing (`str`, defaults to `"leading"`):
589
- The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
590
- Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
591
- rescale_betas_zero_snr (`bool`, defaults to `False`):
592
- Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
593
- dark samples instead of limiting it to samples with medium brightness. Loosely related to
594
- [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
595
- """
596
-
597
- # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
598
- order = 1
599
-
600
@register_to_config
def __init__(
    self,
    num_train_timesteps: int = 1000,
    beta_start: float = 0.0001,
    beta_end: float = 0.02,
    beta_schedule: str = "linear",
    trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
    clip_sample: bool = True,
    set_alpha_to_one: bool = True,
    steps_offset: int = 0,
    prediction_type: str = "epsilon",
    thresholding: bool = False,
    dynamic_thresholding_ratio: float = 0.995,
    clip_sample_range: float = 1.0,
    sample_max_value: float = 1.0,
    timestep_spacing: str = "leading",
    rescale_betas_zero_snr: bool = False,
):
    """
    Build the beta/alpha schedule tensors and the default (descending) timestep list.

    Only `num_train_timesteps`, `beta_start`, `beta_end`, `beta_schedule`, `trained_betas`, `set_alpha_to_one`
    and `rescale_betas_zero_snr` are read in this body. The remaining arguments are not used here; presumably
    `register_to_config` records them for access through `self.config` (confirm against diffusers).

    Raises:
        NotImplementedError: If `trained_betas` is `None` and `beta_schedule` is not one of `linear`,
            `scaled_linear` or `squaredcos_cap_v2`.
    """
    # Explicit betas win over any named schedule.
    if trained_betas is not None:
        self.betas = torch.tensor(trained_betas, dtype=torch.float32)
    elif beta_schedule == "linear":
        self.betas = torch.linspace(
            beta_start, beta_end, num_train_timesteps, dtype=torch.float32
        )
    elif beta_schedule == "scaled_linear":
        # this schedule is very specific to the latent diffusion model:
        # interpolate linearly in sqrt(beta) space, then square.
        sqrt_betas = torch.linspace(
            beta_start**0.5,
            beta_end**0.5,
            num_train_timesteps,
            dtype=torch.float32,
        )
        self.betas = sqrt_betas**2
    elif beta_schedule == "squaredcos_cap_v2":
        # Glide cosine schedule
        self.betas = betas_for_alpha_bar(num_train_timesteps)
    else:
        raise NotImplementedError(
            f"{beta_schedule} does is not implemented for {self.__class__}"
        )

    # Rescale for zero SNR
    if rescale_betas_zero_snr:
        self.betas = rescale_zero_terminal_snr(self.betas)

    self.alphas = 1.0 - self.betas
    self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

    # `step` looks up the cumulative alpha of the *next* timestep in the chain. When that lookup
    # falls below index 0 this value is used instead: either exactly one, or the cumulative alpha at step 0.
    if set_alpha_to_one:
        self.final_alpha_cumprod = torch.tensor(1.0)
    else:
        self.final_alpha_cumprod = self.alphas_cumprod[0]

    # standard deviation of the initial noise distribution
    self.init_noise_sigma = 1.0

    # setable values (overwritten by `set_timesteps`)
    self.num_inference_steps = None
    descending_steps = np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)
    self.timesteps = torch.from_numpy(descending_steps)
667
-
668
def scale_model_input(
    self, sample: torch.FloatTensor, timestep: Optional[int] = None
) -> torch.FloatTensor:
    """
    Return `sample` untouched.

    The hook exists so this scheduler offers the same `scale_model_input` entry point as schedulers that rescale
    the denoising model input per timestep; this implementation applies no scaling.

    Args:
        sample (`torch.FloatTensor`):
            The input sample.
        timestep (`int`, *optional*):
            Accepted for interface compatibility; not read.
    Returns:
        `torch.FloatTensor`:
            The same object that was passed in as `sample`.
    """
    unscaled = sample
    return unscaled
684
-
685
def _get_variance(self, timestep, prev_timestep):
    """
    Compute the variance term for moving from `timestep` to `prev_timestep`.

    The value is `(1 - a_prev) / (1 - a_t) * (1 - a_t / a_prev)`, where `a_t` and `a_prev` are read from
    `self.alphas_cumprod`. A negative `prev_timestep` selects `self.final_alpha_cumprod` for `a_prev`.
    """
    cumulative_now = self.alphas_cumprod[timestep]
    if prev_timestep >= 0:
        cumulative_prev = self.alphas_cumprod[prev_timestep]
    else:
        cumulative_prev = self.final_alpha_cumprod

    remaining_now = 1 - cumulative_now
    remaining_prev = 1 - cumulative_prev

    return (remaining_prev / remaining_now) * (1 - cumulative_now / cumulative_prev)
700
-
701
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
    """
    Apply "dynamic thresholding" (https://arxiv.org/abs/2205.11487) to a 4-D sample.

    For every item in the batch, the `self.config.dynamic_thresholding_ratio` quantile `s` of the absolute values
    is computed, limited to the range `[1, self.config.sample_max_value]`, and the item is clamped to `[-s, s]`
    and divided by `s`. The result has the input's shape and dtype.
    """
    input_dtype = sample.dtype
    n_items, n_channels, n_rows, n_cols = sample.shape

    # upcast for quantile calculation, and clamp not implemented for cpu half
    if input_dtype not in (torch.float32, torch.float64):
        sample = sample.float()

    # One row per image so the quantile is taken over each image separately.
    flat = sample.reshape(n_items, n_channels * n_rows * n_cols)

    # "a certain percentile absolute pixel value"
    limit = torch.quantile(flat.abs(), self.config.dynamic_thresholding_ratio, dim=1)
    # When clamped to min=1, equivalent to standard clipping to [-1, 1]
    limit = torch.clamp(limit, min=1, max=self.config.sample_max_value)
    # (batch_size, 1) because clamp will broadcast along dim=0
    limit = limit.unsqueeze(1)

    # "we threshold xt0 to the range [-s, s] and then divide by s"
    flat = torch.clamp(flat, -limit, limit) / limit

    return flat.reshape(n_items, n_channels, n_rows, n_cols).to(input_dtype)
738
-
739
def set_timesteps(
    self,
    stength,
    num_inference_steps: int,
    lcm_origin_steps: int,
    device: Union[str, torch.device] = None,
):
    """
    Sets the discrete timesteps used for the diffusion chain (to be run before inference).

    The training range is first divided into `lcm_origin_steps` evenly spaced timesteps, only the first
    `int(lcm_origin_steps * stength)` of them are kept, and `num_inference_steps` timesteps are then picked from
    that list, walking from its end towards its start with a constant stride.

    Args:
        stength (`float`):
            Fraction of the `lcm_origin_steps` schedule to keep (the parameter name is misspelled; it is left
            unchanged so existing keyword callers keep working).
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model.
        lcm_origin_steps (`int`):
            Number of evenly spaced timesteps the training range is divided into before subsampling.
        device (`str` or `torch.device`, *optional*):
            Device the resulting `self.timesteps` tensor is moved to.
    Raises:
        ValueError: If `num_inference_steps` or `lcm_origin_steps` exceeds `self.config.num_train_timesteps`, or
            if `int(lcm_origin_steps * stength)` is smaller than `num_inference_steps`.
    """
    train_steps = self.config.num_train_timesteps

    if num_inference_steps > train_steps:
        raise ValueError(
            f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
            f" {train_steps} as the unet model trained with this scheduler can only handle"
            f" maximal {train_steps} timesteps."
        )

    # With more origin steps than training steps the stride `c` below would be 0 and every
    # timestep would silently become -1.
    if lcm_origin_steps > train_steps:
        raise ValueError(
            f"`lcm_origin_steps`: {lcm_origin_steps} cannot be larger than `self.config.train_timesteps`:"
            f" {train_steps}."
        )

    # LCM Timesteps Setting: # Linear Spacing
    c = train_steps // lcm_origin_steps
    kept_origin_steps = int(lcm_origin_steps * stength)
    lcm_origin_timesteps = np.arange(1, kept_origin_steps + 1) * c - 1  # LCM Training Steps Schedule

    skipping_step = len(lcm_origin_timesteps) // num_inference_steps
    # A stride of 0 would make the slice below fail with an opaque "slice step cannot be zero".
    if skipping_step < 1:
        raise ValueError(
            f"`int(lcm_origin_steps * stength)`: {kept_origin_steps} is smaller than `num_inference_steps`:"
            f" {num_inference_steps}. Reduce `num_inference_steps` or increase `stength` / `lcm_origin_steps`."
        )

    # LCM Inference Steps Schedule: walk backwards from the largest kept timestep.
    timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]

    self.num_inference_steps = num_inference_steps
    # `.copy()` materialises the negatively-strided view, which `torch.from_numpy` cannot wrap.
    self.timesteps = torch.from_numpy(timesteps.copy()).to(device)
773
-
774
def get_scalings_for_boundary_condition_discrete(self, t):
    """
    Return the pair `(c_skip, c_out)` used to blend the current sample with the predicted original sample.

    With `s = t / 0.1` and `sigma = 0.5`: `c_skip = sigma**2 / (s**2 + sigma**2)` and
    `c_out = s / sqrt(s**2 + sigma**2)`. As a side effect `self.sigma_data` is (re)set to `0.5`.
    """
    sigma = 0.5  # Default: 0.5
    self.sigma_data = sigma

    # By dividing 0.1: This is almost a delta function at t=0.
    scaled_t = t / 0.1
    denominator = scaled_t**2 + sigma**2

    c_skip = sigma**2 / denominator
    c_out = scaled_t / denominator**0.5
    return c_skip, c_out
781
-
782
def step(
    self,
    model_output: torch.FloatTensor,
    timeindex: int,
    timestep: int,
    sample: torch.FloatTensor,
    eta: float = 0.0,
    use_clipped_model_output: bool = False,
    generator=None,
    variance_noise: Optional[torch.FloatTensor] = None,
    return_dict: bool = True,
) -> Union["LCMSchedulerOutput", Tuple]:
    """
    Run one LCM sampling step: predict the original sample from `model_output`, blend it with `sample` using the
    boundary-condition scalings, and (for multi-step schedules) re-noise it to the next timestep in
    `self.timesteps`.

    Args:
        model_output (`torch.FloatTensor`):
            The direct output from learned diffusion model.
        timeindex (`int`):
            Position of `timestep` inside `self.timesteps`; the entry at `timeindex + 1` is the target timestep.
            For the last entry the target is `timestep` itself.
        timestep (`int`):
            The current discrete timestep in the diffusion chain.
        sample (`torch.FloatTensor`):
            A current instance of a sample created by the diffusion process.
        eta (`float`):
            Accepted for interface compatibility; not read by this implementation.
        use_clipped_model_output (`bool`, defaults to `False`):
            Accepted for interface compatibility; not read by this implementation.
        generator (`torch.Generator`, *optional*):
            A random number generator used to draw the re-noising noise. When omitted the global RNG is used.
        variance_noise (`torch.FloatTensor`, *optional*):
            Noise to use instead of drawing it; takes precedence over `generator`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether to return an `LCMSchedulerOutput` or a `tuple`.
    Returns:
        `LCMSchedulerOutput` or `tuple`:
            If `return_dict` is `True` an `LCMSchedulerOutput` is returned, otherwise the tuple
            `(prev_sample, denoised)`.
    Raises:
        ValueError: If `set_timesteps` has not been called, or if `self.config.prediction_type` is not one of
            `epsilon`, `sample` or `v_prediction`.
    """
    if self.num_inference_steps is None:
        raise ValueError(
            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
        )

    # 1. get previous step value
    prev_timeindex = timeindex + 1
    if prev_timeindex < len(self.timesteps):
        prev_timestep = self.timesteps[prev_timeindex]
    else:
        prev_timestep = timestep

    # 2. compute alphas, betas
    alpha_prod_t = self.alphas_cumprod[timestep]
    if prev_timestep >= 0:
        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep]
    else:
        alpha_prod_t_prev = self.final_alpha_cumprod

    beta_prod_t = 1 - alpha_prod_t
    beta_prod_t_prev = 1 - alpha_prod_t_prev

    # 3. Get scalings for boundary conditions
    c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)

    # 4. Different Parameterization:
    parameterization = self.config.prediction_type

    if parameterization == "epsilon":  # noise-prediction
        pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
    elif parameterization == "sample":  # x-prediction
        pred_x0 = model_output
    elif parameterization == "v_prediction":  # v-prediction
        pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
    else:
        # Previously an unknown value fell through and failed later with an unbound-variable error.
        raise ValueError(
            f"prediction_type given as {parameterization} must be one of `epsilon`, `sample` or `v_prediction`"
        )

    # 5. Denoise model output using boundary conditions
    denoised = c_out * pred_x0 + c_skip * sample

    # 6. Sample z ~ N(0, I), For MultiStep Inference
    # Noise is not used for one-step sampling.
    if len(self.timesteps) > 1:
        if variance_noise is not None:
            noise = variance_noise.to(model_output.device)
        elif generator is not None:
            # Draw on the generator's own device, then move next to the model output.
            noise = torch.randn(
                model_output.shape, generator=generator, device=generator.device
            ).to(model_output.device)
        else:
            noise = torch.randn(model_output.shape).to(model_output.device)
        prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
    else:
        prev_sample = denoised

    if not return_dict:
        return (prev_sample, denoised)

    return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)
878
-
879
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
def add_noise(
    self,
    original_samples: torch.FloatTensor,
    noise: torch.FloatTensor,
    timesteps: torch.IntTensor,
) -> torch.FloatTensor:
    """
    Mix `noise` into `original_samples` at the given `timesteps`.

    Returns `sqrt(a) * original_samples + sqrt(1 - a) * noise`, where `a` is `self.alphas_cumprod[timesteps]`
    moved to the device and dtype of `original_samples` and broadcast over the trailing dimensions.
    """
    target_device = original_samples.device
    # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
    schedule = self.alphas_cumprod.to(device=target_device, dtype=original_samples.dtype)
    steps = timesteps.to(target_device)

    # One leading axis for the selected timesteps, then size-1 axes so the factors broadcast.
    broadcast_axes = (1,) * max(len(original_samples.shape) - 1, 0)

    signal_factor = (schedule[steps] ** 0.5).flatten().reshape(-1, *broadcast_axes)
    noise_factor = ((1 - schedule[steps]) ** 0.5).flatten().reshape(-1, *broadcast_axes)

    return signal_factor * original_samples + noise_factor * noise
906
-
907
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(
    self,
    sample: torch.FloatTensor,
    noise: torch.FloatTensor,
    timesteps: torch.IntTensor,
) -> torch.FloatTensor:
    """
    Compute `sqrt(a) * noise - sqrt(1 - a) * sample` for the given `timesteps`.

    `a` is `self.alphas_cumprod[timesteps]`, moved to the device and dtype of `sample` and broadcast over the
    trailing dimensions.
    """
    target_device = sample.device
    # Make sure alphas_cumprod and timestep have same device and dtype as sample
    schedule = self.alphas_cumprod.to(device=target_device, dtype=sample.dtype)
    steps = timesteps.to(target_device)

    # One leading axis for the selected timesteps, then size-1 axes so the factors broadcast.
    broadcast_axes = (1,) * max(len(sample.shape) - 1, 0)

    signal_factor = (schedule[steps] ** 0.5).flatten().reshape(-1, *broadcast_axes)
    noise_factor = ((1 - schedule[steps]) ** 0.5).flatten().reshape(-1, *broadcast_axes)

    return signal_factor * noise - noise_factor * sample
932
-
933
def __len__(self):
    """Return `self.config.num_train_timesteps`."""
    train_steps = self.config.num_train_timesteps
    return train_steps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
latent_consistency_txt2img.py DELETED
@@ -1,836 +0,0 @@
1
- # Copyright 2023 Stanford University Team and The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- # DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
16
- # and https://github.com/hojonathanho/diffusion
17
-
18
- import math
19
- from dataclasses import dataclass
20
- from typing import Any, Dict, List, Optional, Tuple, Union
21
-
22
- import numpy as np
23
- import torch
24
- from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
25
-
26
- from diffusers import (
27
- AutoencoderKL,
28
- ConfigMixin,
29
- DiffusionPipeline,
30
- SchedulerMixin,
31
- UNet2DConditionModel,
32
- logging,
33
- )
34
- from diffusers.configuration_utils import register_to_config
35
- from diffusers.image_processor import VaeImageProcessor
36
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
37
- from diffusers.pipelines.stable_diffusion.safety_checker import (
38
- StableDiffusionSafetyChecker,
39
- )
40
- from diffusers.utils import BaseOutput
41
-
42
-
43
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
44
-
45
-
46
- class LatentConsistencyModelPipeline(DiffusionPipeline):
47
- _optional_components = ["scheduler"]
48
-
49
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: "LCMScheduler",
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPImageProcessor,
    requires_safety_checker: bool = True,
):
    """
    Register the pipeline components and derive the VAE scale factor and image processor.

    When `scheduler` is `None`, an `LCMScheduler` with a `scaled_linear` beta schedule
    (`beta_start=0.00085`, `beta_end=0.0120`) and `epsilon` prediction is created.
    `requires_safety_checker` is accepted but not read in this body.
    """
    super().__init__()

    # Fall back to the default LCM scheduler only when the caller supplied none.
    if scheduler is None:
        scheduler = LCMScheduler(
            beta_start=0.00085,
            beta_end=0.0120,
            beta_schedule="scaled_linear",
            prediction_type="epsilon",
        )

    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "tokenizer": tokenizer,
        "unet": unet,
        "scheduler": scheduler,
        "safety_checker": safety_checker,
        "feature_extractor": feature_extractor,
    }
    self.register_modules(**components)

    # Each VAE block after the first halves the resolution.
    vae_block_count = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (vae_block_count - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
84
-
85
def _encode_prompt(
    self,
    prompt,
    device,
    num_images_per_prompt,
    prompt_embeds: Optional[torch.FloatTensor] = None,
):
    r"""
    Encodes the prompt into text encoder hidden states.
    Args:
        prompt (`str` or `List[str]`, *optional*):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
    Returns:
        `torch.FloatTensor`: embeddings of shape `(batch * num_images_per_prompt, seq_len, dim)`.
    Raises:
        ValueError: If `prompt_embeds` is `None` and `prompt` is neither a `str` nor a `list`.
    """
    # Without embeddings there must be a prompt to tokenize; fail early with a clear message.
    if prompt_embeds is None and not isinstance(prompt, (str, list)):
        raise ValueError(
            "`prompt` must be a `str` or a `list` of `str` when `prompt_embeds` is not provided."
        )

    if prompt_embeds is None:
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        # Tokenize again without truncation, only to detect and report what was cut off.
        untruncated_ids = self.tokenizer(
            prompt, padding="longest", return_tensors="pt"
        ).input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[
            -1
        ] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(
                untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )

        if (
            hasattr(self.text_encoder.config, "use_attention_mask")
            and self.text_encoder.config.use_attention_mask
        ):
            attention_mask = text_inputs.attention_mask.to(device)
        else:
            attention_mask = None

        prompt_embeds = self.text_encoder(
            text_input_ids.to(device),
            attention_mask=attention_mask,
        )
        prompt_embeds = prompt_embeds[0]

    # Cast to the dtype of the first available component, falling back to the embeddings' own dtype.
    if self.text_encoder is not None:
        prompt_embeds_dtype = self.text_encoder.dtype
    elif self.unet is not None:
        prompt_embeds_dtype = self.unet.dtype
    else:
        prompt_embeds_dtype = prompt_embeds.dtype

    prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

    bs_embed, seq_len, _ = prompt_embeds.shape
    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(
        bs_embed * num_images_per_prompt, seq_len, -1
    )

    # Don't need to get uncond prompt embedding because of LCM Guided Distillation
    return prompt_embeds
169
-
170
def run_safety_checker(self, image, device, dtype):
    """
    Run `self.safety_checker` on `image`, if one is configured.

    Returns the pair `(image, has_nsfw_concept)`. With no safety checker the image is returned unchanged and
    `has_nsfw_concept` is `None`; otherwise both values are whatever the safety checker returns.
    """
    if self.safety_checker is None:
        return image, None

    # The feature extractor is fed PIL images; tensors and numpy arrays are converted differently.
    if torch.is_tensor(image):
        pil_images = self.image_processor.postprocess(image, output_type="pil")
    else:
        pil_images = self.image_processor.numpy_to_pil(image)

    clip_features = self.feature_extractor(pil_images, return_tensors="pt").to(device)
    checked_image, has_nsfw_concept = self.safety_checker(
        images=image, clip_input=clip_features.pixel_values.to(dtype)
    )
    return checked_image, has_nsfw_concept
187
-
188
def prepare_latents(
    self,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    latents=None,
    generator=None,
):
    """
    Return the initial latents for sampling, scaled by `self.scheduler.init_noise_sigma`.

    Args:
        batch_size (`int`): Number of latent samples.
        num_channels_latents (`int`): Number of latent channels.
        height (`int`), width (`int`): Image size in pixels; each is divided by `self.vae_scale_factor`.
        dtype (`torch.dtype`): dtype of freshly drawn latents.
        device: Device the latents are moved to.
        latents (`torch.FloatTensor`, *optional*): Pre-made latents; moved to `device` and used instead of
            drawing new noise.
        generator (`torch.Generator`, *optional*): RNG for drawing the latents. When omitted, a new CPU generator
            is created and seeded from the global RNG.
    """
    shape = (
        batch_size,
        num_channels_latents,
        height // self.vae_scale_factor,
        width // self.vae_scale_factor,
    )
    if generator is None:
        generator = torch.Generator()
        generator.manual_seed(torch.randint(0, 2 ** 32, (1,)).item())

    if latents is None:
        # Draw on the generator's own device (a generator only works for tensors on its device),
        # then move the result to the execution device.
        latents = torch.randn(
            shape, dtype=dtype, generator=generator, device=generator.device
        ).to(device)
    else:
        latents = latents.to(device)
    # scale the initial noise by the standard deviation required by the scheduler
    latents = latents * self.scheduler.init_noise_sigma
    return latents
216
-
217
def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32):
    """
    Build sinusoidal embeddings of the guidance scale values `w`.

    see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
    Args:
        w: torch.Tensor: 1-D tensor of values to embed (multiplied by 1000 before embedding)
        embedding_dim: int: dimension of the embeddings to generate
        dtype: data type of the generated embeddings
    Returns:
        embedding vectors with shape `(len(w), embedding_dim)`: sines in the first half, cosines in the second,
        plus one trailing zero column when `embedding_dim` is odd
    """
    assert len(w.shape) == 1
    w = w * 1000.0

    half_dim = embedding_dim // 2
    # Log-spaced frequencies from 1 down to 1/10000.
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    # Build the frequency table on the same device as `w` so the product below never mixes devices.
    emb = torch.exp(torch.arange(half_dim, dtype=dtype, device=w.device) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad
        emb = torch.nn.functional.pad(emb, (0, 1))
    assert emb.shape == (w.shape[0], embedding_dim)
    return emb
239
-
240
@torch.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    height: Optional[int] = 768,
    width: Optional[int] = 768,
    guidance_scale: float = 7.5,
    num_images_per_prompt: Optional[int] = 1,
    latents: Optional[torch.FloatTensor] = None,
    generator: Optional[torch.Generator] = None,
    num_inference_steps: int = 4,
    lcm_origin_steps: int = 50,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
    """
    Generate images from a text prompt with multi-step LCM sampling.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            Prompt(s) to encode. When neither a `str` nor a `list`, the batch size is taken from
            `prompt_embeds`.
        height (`int`, *optional*, defaults to 768), width (`int`, *optional*, defaults to 768):
            Output size in pixels. A falsy value falls back to
            `self.unet.config.sample_size * self.vae_scale_factor`.
        guidance_scale (`float`, defaults to 7.5):
            Embedded with `get_w_embedding` and passed to the UNet as `timestep_cond`.
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            Number of images per prompt.
        latents (`torch.FloatTensor`, *optional*):
            Pre-made initial latents, forwarded to `prepare_latents`.
        generator (`torch.Generator`, *optional*):
            RNG forwarded to `prepare_latents`.
        num_inference_steps (`int`, defaults to 4), lcm_origin_steps (`int`, defaults to 50):
            Forwarded to `self.scheduler.set_timesteps`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings, forwarded to `_encode_prompt`.
        output_type (`str`, *optional*, defaults to `"pil"`):
            `"latent"` skips VAE decoding and the safety checker; any value is passed on to
            `self.image_processor.postprocess`.
        return_dict (`bool`, defaults to `True`):
            Whether to return a `StableDiffusionPipelineOutput` instead of a tuple.
        cross_attention_kwargs (`dict`, *optional*):
            Forwarded to the UNet call.
    Returns:
        `StableDiffusionPipelineOutput` when `return_dict` is `True`, otherwise the tuple
        `(image, has_nsfw_concept)`.
    """
    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    device = self._execution_device
    # do_classifier_free_guidance = guidance_scale > 0.0  # In LCM Implementation:  cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond) , (cfg_scale > 0.0 using CFG)

    # 3. Encode input prompt
    prompt_embeds = self._encode_prompt(
        prompt,
        device,
        num_images_per_prompt,
        prompt_embeds=prompt_embeds,
    )

    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, lcm_origin_steps)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variable
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        latents,
        generator
    )
    bs = batch_size * num_images_per_prompt

    # 6. Get Guidance Scale Embedding
    # The same guidance scale is used for every sample in the batch.
    w = torch.tensor(guidance_scale).repeat(bs)
    w_embedding = self.get_w_embedding(w, embedding_dim=256).to(
        device=device, dtype=latents.dtype
    )

    # 7. LCM MultiStep Sampling Loop:
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # One copy of the current timestep per sample in the batch.
            ts = torch.full((bs,), t, device=device, dtype=torch.long)
            # Cast back to the embedding dtype before every UNet call.
            latents = latents.to(prompt_embeds.dtype)

            # model prediction (v-prediction, eps, x)
            model_pred = self.unet(
                latents,
                ts,
                timestep_cond=w_embedding,
                encoder_hidden_states=prompt_embeds,
                cross_attention_kwargs=cross_attention_kwargs,
                return_dict=False,
            )[0]

            # compute the previous noisy sample x_t -> x_t-1
            # `denoised` from the final iteration is what gets decoded below.
            latents, denoised = self.scheduler.step(
                model_pred, i, t, latents, return_dict=False
            )

            # # call the callback, if provided
            # if i == len(timesteps) - 1:
            progress_bar.update()

    denoised = denoised.to(prompt_embeds.dtype)
    if not output_type == "latent":
        image = self.vae.decode(
            denoised / self.vae.config.scaling_factor, return_dict=False
        )[0]
        image, has_nsfw_concept = self.run_safety_checker(
            image, device, prompt_embeds.dtype
        )
    else:
        image = denoised
        has_nsfw_concept = None

    # Images flagged by the safety checker are not denormalized.
    if has_nsfw_concept is None:
        do_denormalize = [True] * image.shape[0]
    else:
        do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

    image = self.image_processor.postprocess(
        image, output_type=output_type, do_denormalize=do_denormalize
    )

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(
        images=image, nsfw_content_detected=has_nsfw_concept
    )
356
-
357
-
358
@dataclass
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
# NOTE(review): unlike that upstream class, the second field here is named `denoised`.
class LCMSchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.
    Args:
        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
        denoised (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images, *optional*):
            The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
            `denoised` can be used to preview progress or for guidance.
    """

    prev_sample: torch.FloatTensor
    denoised: Optional[torch.FloatTensor] = None
374
-
375
-
376
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
    num_diffusion_timesteps,
    max_beta=0.999,
    alpha_transform_type="cosine",
):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
    (1-beta) over time from t = [0,1].
    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
    to that part of the diffusion process.
    Args:
        num_diffusion_timesteps (`int`): the number of betas to produce.
        max_beta (`float`): the maximum beta to use; use values lower than 1 to
                     prevent singularities.
        alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
                     Choose from `cosine` or `exp`
    Returns:
        betas (`torch.Tensor`): float32 tensor of length `num_diffusion_timesteps` with the betas used by the
        scheduler to step the model outputs
    Raises:
        ValueError: If `alpha_transform_type` is neither `cosine` nor `exp`.
    """
    if alpha_transform_type == "cosine":

        def alpha_bar_fn(t):
            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    elif alpha_transform_type == "exp":

        def alpha_bar_fn(t):
            return math.exp(t * -12.0)

    else:
        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

    # beta_i = 1 - alpha_bar(t_{i+1}) / alpha_bar(t_i), capped at `max_beta`.
    betas = [
        min(
            1 - alpha_bar_fn((i + 1) / num_diffusion_timesteps) / alpha_bar_fn(i / num_diffusion_timesteps),
            max_beta,
        )
        for i in range(num_diffusion_timesteps)
    ]
    return torch.tensor(betas, dtype=torch.float32)
415
-
416
-
417
def rescale_zero_terminal_snr(betas):
    """
    Rescale a beta schedule so that the terminal signal-to-noise ratio is zero.

    Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1).

    Args:
        betas (`torch.FloatTensor`):
            The betas the scheduler is being initialized with. The input tensor is not modified.

    Returns:
        `torch.FloatTensor`: rescaled betas with zero terminal SNR.
    """
    # sqrt of the cumulative alpha product for every timestep
    sqrt_cumprod = torch.cumprod(1.0 - betas, dim=0).sqrt()

    # Remember the endpoints before shifting.
    first = sqrt_cumprod[0].clone()
    last = sqrt_cumprod[-1].clone()

    # Shift so the final entry becomes zero, then scale so the first entry keeps its old value.
    shifted = sqrt_cumprod - last
    rescaled = shifted * (first / (first - last))

    # Undo the sqrt, then undo the cumulative product to recover per-step alphas.
    cumprod = rescaled**2
    step_alphas = torch.cat([cumprod[0:1], cumprod[1:] / cumprod[:-1]])

    return 1 - step_alphas
448
-
449
-
450
class LCMScheduler(SchedulerMixin, ConfigMixin):
    """
    `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with
    non-Markovian guidance.

    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        beta_start (`float`, defaults to 0.0001):
            The starting `beta` value of inference.
        beta_end (`float`, defaults to 0.02):
            The final `beta` value.
        beta_schedule (`str`, defaults to `"linear"`):
            The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
        trained_betas (`np.ndarray`, *optional*):
            Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
        clip_sample (`bool`, defaults to `True`):
            Clip the predicted sample for numerical stability.
        clip_sample_range (`float`, defaults to 1.0):
            The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
        set_alpha_to_one (`bool`, defaults to `True`):
            Each diffusion step uses the alphas product value at that step and at the previous one. For the final step
            there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
            otherwise it uses the alpha value at step 0.
        steps_offset (`int`, defaults to 0):
            An offset added to the inference steps. You can use a combination of `offset=1` and
            `set_alpha_to_one=False` to make the last step use step 0 for the previous alpha product like in Stable
            Diffusion.
        prediction_type (`str`, defaults to `epsilon`, *optional*):
            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
            `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
            Video](https://imagen.research.google/video/paper.pdf) paper).
        thresholding (`bool`, defaults to `False`):
            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
            as Stable Diffusion.
        dynamic_thresholding_ratio (`float`, defaults to 0.995):
            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
        sample_max_value (`float`, defaults to 1.0):
            The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
        timestep_spacing (`str`, defaults to `"leading"`):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        rescale_betas_zero_snr (`bool`, defaults to `False`):
            Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
            dark samples instead of limiting it to samples with medium brightness. Loosely related to
            [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
    """

    # _compatibles = [e.name for e in KarrasDiffusionSchedulers]
    order = 1

    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
        clip_sample: bool = True,
        set_alpha_to_one: bool = True,
        steps_offset: int = 0,
        prediction_type: str = "epsilon",
        thresholding: bool = False,
        dynamic_thresholding_ratio: float = 0.995,
        clip_sample_range: float = 1.0,
        sample_max_value: float = 1.0,
        timestep_spacing: str = "leading",
        rescale_betas_zero_snr: bool = False,
    ):
        """
        Build the beta/alpha tables and the default (full-length, descending) timestep list.

        Raises:
            NotImplementedError: if `beta_schedule` is not one of the supported schedule names and
                `trained_betas` is not given.
        """
        if trained_betas is not None:
            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
        elif beta_schedule == "linear":
            self.betas = torch.linspace(
                beta_start, beta_end, num_train_timesteps, dtype=torch.float32
            )
        elif beta_schedule == "scaled_linear":
            # this schedule is very specific to the latent diffusion model.
            self.betas = (
                torch.linspace(
                    beta_start**0.5,
                    beta_end**0.5,
                    num_train_timesteps,
                    dtype=torch.float32,
                )
                ** 2
            )
        elif beta_schedule == "squaredcos_cap_v2":
            # Glide cosine schedule
            self.betas = betas_for_alpha_bar(num_train_timesteps)
        else:
            raise NotImplementedError(
                f"{beta_schedule} does is not implemented for {self.__class__}"
            )

        # Rescale for zero SNR
        if rescale_betas_zero_snr:
            self.betas = rescale_zero_terminal_snr(self.betas)

        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

        # At every step in ddim, we are looking into the previous alphas_cumprod
        # For the final step, there is no previous alphas_cumprod because we are already at 0
        # `set_alpha_to_one` decides whether we set this parameter simply to one or
        # whether we use the final alpha of the "non-previous" one.
        self.final_alpha_cumprod = (
            torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0]
        )

        # standard deviation of the initial noise distribution
        self.init_noise_sigma = 1.0

        # setable values
        self.num_inference_steps = None
        self.timesteps = torch.from_numpy(
            np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)
        )

    def scale_model_input(
        self, sample: torch.FloatTensor, timestep: Optional[int] = None
    ) -> torch.FloatTensor:
        """
        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
        current timestep. This scheduler applies no scaling.

        Args:
            sample (`torch.FloatTensor`):
                The input sample.
            timestep (`int`, *optional*):
                The current timestep in the diffusion chain. Unused here.

        Returns:
            `torch.FloatTensor`:
                The input sample, returned unchanged.
        """
        return sample

    def _get_variance(self, timestep, prev_timestep):
        """Return the variance term computed from the cumulative alphas at `timestep` and `prev_timestep`."""
        alpha_prod_t = self.alphas_cumprod[timestep]
        # A negative previous timestep means "before step 0": fall back to the configured final alpha.
        alpha_prod_t_prev = (
            self.alphas_cumprod[prev_timestep]
            if prev_timestep >= 0
            else self.final_alpha_cumprod
        )
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        variance = (beta_prod_t_prev / beta_prod_t) * (
            1 - alpha_prod_t / alpha_prod_t_prev
        )

        return variance

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
    def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
        photorealism as well as better image-text alignment, especially when using very large guidance weights."

        https://arxiv.org/abs/2205.11487

        Args:
            sample (`torch.FloatTensor`): a 4-D tensor unpacked as `(batch_size, channels, height, width)`.

        Returns:
            `torch.FloatTensor`: the thresholded sample, with the input's shape and dtype.
        """
        dtype = sample.dtype
        batch_size, channels, height, width = sample.shape

        if dtype not in (torch.float32, torch.float64):
            # upcast for quantile calculation, and clamp not implemented for cpu half
            sample = sample.float()

        # Flatten sample for doing quantile calculation along each image
        sample = sample.reshape(batch_size, channels * height * width)

        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"

        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
        # When clamped to min=1, equivalent to standard clipping to [-1, 1]
        s = torch.clamp(s, min=1, max=self.config.sample_max_value)

        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
        # "we threshold xt0 to the range [-s, s] and then divide by s"
        sample = torch.clamp(sample, -s, s) / s

        sample = sample.reshape(batch_size, channels, height, width)
        sample = sample.to(dtype)

        return sample

    def set_timesteps(
        self,
        num_inference_steps: int,
        lcm_origin_steps: int,
        device: Union[str, torch.device] = None,
    ):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        The training range is first divided into `lcm_origin_steps` evenly spaced timesteps; the inference
        schedule is then taken from that list, walking from the largest timestep downwards with a fixed stride
        and keeping `num_inference_steps` entries.

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model. Must be
                between 1 and `lcm_origin_steps`.
            lcm_origin_steps (`int`):
                The number of evenly spaced timesteps the training range is divided into before sub-sampling.
                Must be between 1 and `self.config.num_train_timesteps`.
            device (`str` or `torch.device`, *optional*):
                The device the resulting `self.timesteps` tensor is moved to.

        Raises:
            ValueError: if the step counts are outside the ranges described above.
        """
        num_train_timesteps = self.config.num_train_timesteps

        if num_inference_steps > num_train_timesteps:
            raise ValueError(
                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
                f" maximal {self.config.num_train_timesteps} timesteps."
            )

        # With lcm_origin_steps > num_train_timesteps the stride `c` below would be 0 and every timestep
        # would silently become -1; with lcm_origin_steps <= 0 the division itself fails.
        if not 0 < lcm_origin_steps <= num_train_timesteps:
            raise ValueError(
                f"`lcm_origin_steps`: {lcm_origin_steps} must be between 1 and"
                f" `self.config.num_train_timesteps`: {num_train_timesteps}."
            )

        # With num_inference_steps > lcm_origin_steps the slice stride below would be 0.
        if not 0 < num_inference_steps <= lcm_origin_steps:
            raise ValueError(
                f"`num_inference_steps`: {num_inference_steps} must be between 1 and"
                f" `lcm_origin_steps`: {lcm_origin_steps}."
            )

        self.num_inference_steps = num_inference_steps

        # LCM Timesteps Setting: Linear Spacing
        c = num_train_timesteps // lcm_origin_steps
        # LCM Training Steps Schedule
        lcm_origin_timesteps = np.arange(1, lcm_origin_steps + 1) * c - 1
        skipping_step = len(lcm_origin_timesteps) // num_inference_steps
        # LCM Inference Steps Schedule
        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]

        self.timesteps = torch.from_numpy(timesteps.copy()).to(device)

    def get_scalings_for_boundary_condition_discrete(self, t):
        """
        Return the `(c_skip, c_out)` boundary-condition coefficients for timestep `t`.

        Also stores `self.sigma_data` (0.5) on the instance as a side effect.
        """
        self.sigma_data = 0.5  # Default: 0.5

        # By dividing 0.1: This is almost a delta function at t=0.
        c_skip = self.sigma_data**2 / ((t / 0.1) ** 2 + self.sigma_data**2)
        c_out = (t / 0.1) / ((t / 0.1) ** 2 + self.sigma_data**2) ** 0.5
        return c_skip, c_out

    def step(
        self,
        model_output: torch.FloatTensor,
        timeindex: int,
        timestep: int,
        sample: torch.FloatTensor,
        eta: float = 0.0,
        use_clipped_model_output: bool = False,
        generator=None,
        variance_noise: Optional[torch.FloatTensor] = None,
        return_dict: bool = True,
    ) -> Union[LCMSchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
        process from the learned model outputs (most often the predicted noise).

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from learned diffusion model.
            timeindex (`int`):
                Position of `timestep` inside `self.timesteps`; the entry at `timeindex + 1` (if any) is used as
                the previous timestep.
            timestep (`float`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            eta (`float`):
                Accepted for signature compatibility; not used by this implementation.
            use_clipped_model_output (`bool`, defaults to `False`):
                Accepted for signature compatibility; not used by this implementation.
            generator (`torch.Generator`, *optional*):
                A random number generator used to draw the noise injected between steps. When `None`, the
                global RNG is used.
            variance_noise (`torch.FloatTensor`, *optional*):
                Alternative to generating noise with `generator` by directly providing the noise to inject.
                Takes precedence over `generator` when given.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`.

        Returns:
            [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a
                tuple `(prev_sample, denoised)` is returned.

        Raises:
            ValueError: if `set_timesteps` has not been called, or if the configured `prediction_type` is not
                one of `epsilon`, `sample` or `v_prediction`.
        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        # 1. get previous step value
        prev_timeindex = timeindex + 1
        if prev_timeindex < len(self.timesteps):
            prev_timestep = self.timesteps[prev_timeindex]
        else:
            # Last entry of the schedule: there is no further timestep, reuse the current one.
            prev_timestep = timestep

        # 2. compute alphas, betas
        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = (
            self.alphas_cumprod[prev_timestep]
            if prev_timestep >= 0
            else self.final_alpha_cumprod
        )

        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev

        # 3. Get scalings for boundary conditions
        c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep)

        # 4. Different Parameterization:
        parameterization = self.config.prediction_type

        if parameterization == "epsilon":  # noise-prediction
            pred_x0 = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt()
        elif parameterization == "sample":  # x-prediction
            pred_x0 = model_output
        elif parameterization == "v_prediction":  # v-prediction
            pred_x0 = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output
        else:
            # Without this branch `pred_x0` would be unbound and fail later with an unrelated error.
            raise ValueError(
                f"prediction_type given as {parameterization} must be one of `epsilon`, `sample`, or"
                f" `v_prediction` for {self.__class__}"
            )

        # 5. Denoise model output using boundary conditions
        denoised = c_out * pred_x0 + c_skip * sample

        # 6. Sample z ~ N(0, I), For MultiStep Inference
        # Noise is not used for one-step sampling.
        if len(self.timesteps) > 1:
            if variance_noise is not None:
                noise = variance_noise.to(model_output.device)
            else:
                # Draw on the generator's own device (CPU when no generator is given), then move the
                # result next to the model output.
                noise_device = generator.device if generator is not None else "cpu"
                noise = torch.randn(
                    model_output.shape, generator=generator, device=noise_device
                ).to(model_output.device)
            prev_sample = (
                alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise
            )
        else:
            prev_sample = denoised

        if not return_dict:
            return (prev_sample, denoised)

        return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised)

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise
    def add_noise(
        self,
        original_samples: torch.FloatTensor,
        noise: torch.FloatTensor,
        timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        """
        Mix `noise` into `original_samples` at the given `timesteps`.

        Computes `sqrt(alpha_bar_t) * original_samples + sqrt(1 - alpha_bar_t) * noise`, with the per-timestep
        coefficients broadcast over the trailing dimensions of `original_samples`.
        """
        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
        alphas_cumprod = self.alphas_cumprod.to(
            device=original_samples.device, dtype=original_samples.dtype
        )
        timesteps = timesteps.to(original_samples.device)

        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
        # Append singleton dims until the coefficient broadcasts against the sample.
        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

        noisy_samples = (
            sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
        )
        return noisy_samples

    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
    def get_velocity(
        self,
        sample: torch.FloatTensor,
        noise: torch.FloatTensor,
        timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        """
        Return `sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * sample` for the given `timesteps`, with the
        per-timestep coefficients broadcast over the trailing dimensions of `sample`.
        """
        # Make sure alphas_cumprod and timestep have same device and dtype as sample
        alphas_cumprod = self.alphas_cumprod.to(
            device=sample.device, dtype=sample.dtype
        )
        timesteps = timesteps.to(sample.device)

        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
        # Append singleton dims until the coefficient broadcasts against the sample.
        while len(sqrt_alpha_prod.shape) < len(sample.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
        while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape):
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

        velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
        return velocity

    def __len__(self):
        """Return the number of training timesteps from the scheduler config."""
        return self.config.num_train_timesteps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- diffusers==0.21.4
2
  transformers==4.34.1
3
  gradio==3.50.2
4
  --extra-index-url https://download.pytorch.org/whl/cu121
 
1
+ diffusers==0.22.1
2
  transformers==4.34.1
3
  gradio==3.50.2
4
  --extra-index-url https://download.pytorch.org/whl/cu121