Spaces:
Running
on
Zero
Running
on
Zero
Update pops.py
Browse files
pops.py
CHANGED
@@ -15,15 +15,15 @@ prior_instruct_repo: str = 'models/instruct/learned_prior.pth'
|
|
15 |
prior_scene_repo: str = 'models/scene/learned_prior.pth'
|
16 |
prior_repo = "pOpsPaper/operators"
|
17 |
|
18 |
-
gpu = torch.device('cuda')
|
19 |
-
cpu = torch.device('cpu')
|
20 |
|
21 |
class PopsPipelines:
|
22 |
def __init__(self):
|
23 |
weight_dtype = torch.float16
|
24 |
self.weight_dtype = weight_dtype
|
25 |
-
device = 'cuda'
|
26 |
-
self.device = device
|
27 |
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(kandinsky_prior_repo,
|
28 |
subfolder='image_encoder',
|
29 |
torch_dtype=weight_dtype).eval()
|
@@ -84,6 +84,7 @@ class PopsPipelines:
|
|
84 |
return image
|
85 |
|
86 |
def process_text(self, text):
|
|
|
87 |
text_inputs = self.tokenizer(
|
88 |
text,
|
89 |
padding="max_length",
|
@@ -96,12 +97,14 @@ class PopsPipelines:
|
|
96 |
text_encoder_output = self.text_encoder(text_inputs.input_ids.to(self.device))
|
97 |
text_encoder_hidden_states = text_encoder_output.last_hidden_state
|
98 |
text_encoder_concat = text_encoder_hidden_states[:, :mask.sum().item()]
|
|
|
99 |
return text_encoder_concat
|
100 |
|
101 |
def run_binary(self, input_a, input_b, prior_type):
|
102 |
# Move pipeline to GPU
|
103 |
pipeline = self.priors_dict[prior_type]['pipeline']
|
104 |
pipeline.to('cuda')
|
|
|
105 |
input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, input_b,
|
106 |
self.image_encoder,
|
107 |
pipeline.prior.clip_mean.detach(),
|
@@ -131,14 +134,17 @@ class PopsPipelines:
|
|
131 |
|
132 |
# Move pipeline to CPU
|
133 |
pipeline.to('cpu')
|
|
|
134 |
return img_emb
|
135 |
|
136 |
def run_instruct(self, input_a, text):
|
|
|
137 |
text_encodings = self.process_text(text)
|
138 |
|
139 |
# Move pipeline to GPU
|
140 |
instruct_pipeline = self.priors_dict['instruct']['pipeline']
|
141 |
instruct_pipeline.to('cuda')
|
|
|
142 |
input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, None,
|
143 |
self.image_encoder,
|
144 |
instruct_pipeline.prior.clip_mean.detach(), instruct_pipeline.prior.clip_std.detach(),
|
@@ -155,13 +161,15 @@ class PopsPipelines:
|
|
155 |
|
156 |
# Move pipeline to CPU
|
157 |
instruct_pipeline.to('cpu')
|
|
|
158 |
return img_emb
|
159 |
|
160 |
def render(self, img_emb):
|
|
|
161 |
images = self.decoder(image_embeds=img_emb.image_embeds, negative_image_embeds=img_emb.negative_image_embeds,
|
162 |
num_inference_steps=50, height=512,
|
163 |
width=512, guidance_scale=4).images
|
164 |
-
|
165 |
return images[0]
|
166 |
|
167 |
def run_instruct_texture(self, image_object_path, text_instruct, image_texture_path):
|
|
|
15 |
prior_scene_repo: str = 'models/scene/learned_prior.pth'
|
16 |
prior_repo = "pOpsPaper/operators"
|
17 |
|
18 |
+
# gpu = torch.device('cuda')
|
19 |
+
# cpu = torch.device('cpu')
|
20 |
|
21 |
class PopsPipelines:
|
22 |
def __init__(self):
|
23 |
weight_dtype = torch.float16
|
24 |
self.weight_dtype = weight_dtype
|
25 |
+
device = 'cpu' #torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
26 |
+
self.device = 'cuda' #device
|
27 |
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(kandinsky_prior_repo,
|
28 |
subfolder='image_encoder',
|
29 |
torch_dtype=weight_dtype).eval()
|
|
|
84 |
return image
|
85 |
|
86 |
def process_text(self, text):
|
87 |
+
self.text_encoder.to('cuda')
|
88 |
text_inputs = self.tokenizer(
|
89 |
text,
|
90 |
padding="max_length",
|
|
|
97 |
text_encoder_output = self.text_encoder(text_inputs.input_ids.to(self.device))
|
98 |
text_encoder_hidden_states = text_encoder_output.last_hidden_state
|
99 |
text_encoder_concat = text_encoder_hidden_states[:, :mask.sum().item()]
|
100 |
+
self.text_encoder.to('cpu')
|
101 |
return text_encoder_concat
|
102 |
|
103 |
def run_binary(self, input_a, input_b, prior_type):
|
104 |
# Move pipeline to GPU
|
105 |
pipeline = self.priors_dict[prior_type]['pipeline']
|
106 |
pipeline.to('cuda')
|
107 |
+
self.image_encoder.to('cuda')
|
108 |
input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, input_b,
|
109 |
self.image_encoder,
|
110 |
pipeline.prior.clip_mean.detach(),
|
|
|
134 |
|
135 |
# Move pipeline to CPU
|
136 |
pipeline.to('cpu')
|
137 |
+
self.image_encoder.to('cpu')
|
138 |
return img_emb
|
139 |
|
140 |
def run_instruct(self, input_a, text):
|
141 |
+
|
142 |
text_encodings = self.process_text(text)
|
143 |
|
144 |
# Move pipeline to GPU
|
145 |
instruct_pipeline = self.priors_dict['instruct']['pipeline']
|
146 |
instruct_pipeline.to('cuda')
|
147 |
+
self.image_encoder.to('cuda')
|
148 |
input_image_embeds, input_hidden_state = pops_utils.preprocess(input_a, None,
|
149 |
self.image_encoder,
|
150 |
instruct_pipeline.prior.clip_mean.detach(), instruct_pipeline.prior.clip_std.detach(),
|
|
|
161 |
|
162 |
# Move pipeline to CPU
|
163 |
instruct_pipeline.to('cpu')
|
164 |
+
self.image_encoder.to('cpu')
|
165 |
return img_emb
|
166 |
|
167 |
def render(self, img_emb):
|
168 |
+
self.decoder.to('cuda')
|
169 |
images = self.decoder(image_embeds=img_emb.image_embeds, negative_image_embeds=img_emb.negative_image_embeds,
|
170 |
num_inference_steps=50, height=512,
|
171 |
width=512, guidance_scale=4).images
|
172 |
+
self.decoder.to('cpu')
|
173 |
return images[0]
|
174 |
|
175 |
def run_instruct_texture(self, image_object_path, text_instruct, image_texture_path):
|