# Spaces status at capture time: Sleeping
import spaces | |
import torch | |
from pipeline import PixArtSigmaPipeline | |
from diffusers.models import PixArtTransformer2DModel | |
import gradio as gr | |
# Load the pre-trained diffusion models: one checkpoint per stage.
torch_device = "cuda"
torch_precision = torch.bfloat16
base_model = "ptx0/pixart-900m-1024-ft-v0.7-stage1"
stg2_model = "ptx0/pixart-900m-1024-ft-v0.7-stage2"

# Stage 1: load the whole pipeline, then cast it and move it to the device.
base_pipeline = PixArtSigmaPipeline.from_pretrained(base_model, use_safetensors=True)
base_pipeline = base_pipeline.to(dtype=torch_precision, device=torch_device)

# Stage 2: the pipeline is built from the stage-1 pipeline's components, and
# only the transformer is then replaced by the one from the stage-2 checkpoint.
stg2_pipeline = PixArtSigmaPipeline.from_pretrained(
    stg2_model, **base_pipeline.components
)
stg2_transformer = PixArtTransformer2DModel.from_pretrained(
    stg2_model, subfolder="transformer"
)
stg2_pipeline.transformer = stg2_transformer.to(
    dtype=torch_precision, device=torch_device
)
import re | |
def extract_resolution(resolution_str):
    """Parse a ``"<width>x<height>"`` string into integer dimensions.

    Args:
        resolution_str: Text expected to start with ``<digits>x<digits>``,
            for example ``"1152x960"``. Anything after the height digits is
            ignored, because the pattern is only anchored at the start.

    Returns:
        A ``(width, height)`` tuple of ints, or ``None`` when the value is
        not a string or does not start with the expected pattern.
    """
    # re.match raises TypeError on non-string input (e.g. None). Returning
    # None instead lets callers apply their own default resolution.
    if not isinstance(resolution_str, str):
        return None
    match = re.match(r"(\d+)x(\d+)", resolution_str)
    if match is None:
        return None
    width, height = map(int, match.groups())
    return (width, height)
# Image generation entry point: stage 1 produces latents, stage 2 finishes them.
def generate(
    prompt,
    stage1_guidance_scale,
    stage2_guidance_scale,
    num_inference_steps,
    resolution,
    negative_prompt,
):
    """Generate images for ``prompt`` with the two-stage pipeline.

    ``base_pipeline`` is called with ``denoising_end`` set to the hand-off
    point and ``output_type="latent"``; its ``.images`` are then passed as
    ``latents`` to ``stg2_pipeline``, which is called with ``denoising_start``
    set to the same hand-off point.

    Args:
        prompt: Text prompt passed to both stages.
        stage1_guidance_scale: ``guidance_scale`` for the stage-1 call.
        stage2_guidance_scale: ``guidance_scale`` for the stage-2 call.
        num_inference_steps: Step count passed to both stages.
        resolution: ``"<width>x<height>"`` string; 1024x1024 is used when
            ``extract_resolution`` returns a falsy value for it.
        negative_prompt: Negative prompt passed to both stages.

    Returns:
        The ``.images`` attribute of the stage-2 pipeline output.
    """
    parsed_size = extract_resolution(resolution)
    width, height = parsed_size if parsed_size else (1024, 1024)

    # Point in the schedule where stage 1 stops and stage 2 takes over.
    handoff_point = 0.6
    # Fixed seed; the same generator object is handed to both stages in turn.
    seeded_generator = torch.Generator().manual_seed(444)

    # Arguments that are identical for the two pipeline calls.
    shared_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "num_inference_steps": num_inference_steps,
        "num_images_per_prompt": 1,
        "generator": seeded_generator,
    }

    stage1_output = base_pipeline(
        **shared_kwargs,
        guidance_scale=stage1_guidance_scale,
        output_type="latent",
        denoising_end=handoff_point,
        width=width,
        height=height,
    )
    stage2_output = stg2_pipeline(
        **shared_kwargs,
        latents=stage1_output.images,
        guidance_scale=stage2_guidance_scale,
        denoising_start=handoff_point,
    )
    return stage2_output.images
# Example inputs offered in the UI. Each row is ordered like generate()'s
# parameters: prompt, stage-1 guidance scale, stage-2 guidance scale,
# inference steps, resolution, negative prompt.
example_prompts = [
    ["A futuristic cityscape at night under a starry sky", 3.5, 4.5, 25, "1152x960", "blurry, overexposed"],
    ["A serene landscape with a flowing river and autumn trees", 3.0, 4.0, 20, "1152x960", "crowded, noisy"],
    ["An abstract painting of joy and energy in bright colors", 3.0, 4.5, 30, "896x1152", "dark, dull"],
    [
        "a stunning portrait of a hamster with an eye patch, piloting a miniature cessna on a wooden desk in an office, depth of field, bokeh, sharp, f1.4",
        3.2, 4.6, 40, "1024x1024",
        "this is an ugly photograph that no one liked",
    ],
    [
        "Check out my cousin larry in his dirty room, he is such a damn mess",
        3.2, 4.6, 40, "1152x960",
        "the photograph is blurry and unremarkable",
    ],
]
# Build the Gradio interface (resolution choices: 1024x1024, 1152x960,
# 896x1152) and launch it.

# HTML shown under the title; adjacent literals are concatenated as-is.
_description_html = (
    "This is a two-stage mixture-of-experts model implemented in the spirit of NVIDIA's E-Diffi model."
    "<br />The weights were initialised from <strong>terminusresearch/pixart-900m-1024-ft-v0.6</strong> and trained separately on timestep ranges <strong>999-400</strong> and <strong>400-0</strong>."
    "<br />This results in two models where the first stage is responsible for most of the image's composition and colour, and the second stage handles minor-to-fine details."
    "<br />"
    "<br />In comparison to SDXL's refiner, the second stage here handles twice as many timesteps, which allows it to make more use of the text-conditional guidance, improving its capabilities."
    "<br />"
    "<br />Despite being trained with 40% of the schedule, you will discover that using stage 2 stand-alone as a refiner (img2img) will need half the strength - about 20%."
    "<br />When being used in the two-stage pipeline, it should be configured to handle all of its 40% range."
    "<br />"
    "<br />This model is funded and trained by <strong>Terminus Research Group</strong>."
    "<br />For the final training session of ~220,000 steps, <a href='https://lambdalabs.com/'>Lambda Labs</a> provided a generous credit for the use of 8x A100 systems for two weeks."
    "<br />If you would like to collaborate or provide compute, please see the organisation page for how to locate us on Discord."
    "<br />"
    "<br />"
    "<ul>"
    "<li>Lead trainer: @pseudoterminalx (bghira@GitHub)</li>"
    "</ul>"
)

# Input widgets, listed in the same order as generate()'s parameters.
_input_widgets = [
    gr.Text(label="Enter your prompt"),
    gr.Slider(1, 20, step=0.1, label="Guidance Scale (Stage I)", value=3.4),
    gr.Slider(1, 20, step=0.1, label="Guidance Scale (Stage II)", value=4.2),
    gr.Slider(1, 50, step=1, label="Number of Inference Steps", value=35),
    gr.Radio(["1024x1024", "1152x960", "896x1152"], label="Resolution", value="1024x1024"),
    gr.Text(value="underexposed, blurry, ugly, washed-out", label="Negative Prompt"),
]
_output_gallery = gr.Gallery(height=1024, min_width=1024, columns=2)

# NOTE: iface is bound to the return value of launch(), not to the
# gr.Interface object itself.
iface = gr.Interface(
    fn=generate,
    inputs=_input_widgets,
    outputs=_output_gallery,
    examples=example_prompts,
    title="PixArt 900M",
    description=_description_html,
).launch()