Update webgui.py
webgui.py
CHANGED
@@ -35,6 +35,7 @@ huggingface_hub.snapshot_download(
     local_dir_use_symlinks=False,
 )
 
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
 is_shared_ui = True if "fffiloni/EchoMimic" in os.environ['SPACE_ID'] else False
 available_property = False if is_shared_ui else True
 advanced_settings_label = "Advanced Configuration (only for duplicated spaces)" if is_shared_ui else "Advanced Configuration"
@@ -73,13 +74,16 @@ else:
 device = "cuda"
 if not torch.cuda.is_available():
     device = "cpu"
+device = "cpu"
+torch.cuda.is_available = lambda : False
 
 inference_config_path = config.inference_config
 infer_config = OmegaConf.load(inference_config_path)
 
 ############# model_init started #############
 ## vae init
-vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to("cuda", dtype=weight_dtype)
+# vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to("cuda", dtype=weight_dtype)
+vae = AutoencoderKL.from_pretrained(config.pretrained_vae_path).to("cpu", dtype=weight_dtype)
 
 ## reference net init
 reference_unet = UNet2DConditionModel.from_pretrained(
@@ -113,7 +117,8 @@ else:
 denoising_unet.load_state_dict(torch.load(config.denoising_unet_path, map_location="cpu"), strict=False)
 
 ## face locator init
-face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device="cuda")
+# face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device="cuda")
+face_locator = FaceLocator(320, conditioning_channels=1, block_out_channels=(16, 32, 96, 256)).to(dtype=weight_dtype, device="cpu")
 face_locator.load_state_dict(torch.load(config.face_locator_path))
 
 ## load audio processor params
@@ -134,7 +139,7 @@ pipe = Audio2VideoPipeline(
     audio_guider=audio_processor,
     face_locator=face_locator,
     scheduler=scheduler,
-).to("
+).to("cpu", dtype=weight_dtype)
 
 def select_face(det_bboxes, probs):
     ## max face from faces that the prob is above 0.8
@@ -182,7 +187,8 @@ def process_video(uploaded_img, uploaded_audio, width, height, length, seed, fac
     face_mask = cv2.resize(face_mask, (width, height))
 
     ref_image_pil = Image.fromarray(face_img[:, :, [2, 1, 0]])
-    face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+    # face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cuda").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
+    face_mask_tensor = torch.Tensor(face_mask).to(dtype=weight_dtype, device="cpu").unsqueeze(0).unsqueeze(0).unsqueeze(0) / 255.0
 
     video = pipe(
         ref_image_pil,