# Web-page header captured along with the file (not part of the config):
# UDiffText / configs / test / textdesign_sd_2.yaml
# ZYMPKU's picture
# v1
# ed25868
# raw
# history blame
# 4.54 kB
# Model definition for a text-design diffusion engine.
# Every `*_config` node follows the same shape: `target` is a dotted import
# path and `params` holds the keyword arguments for that target.
# NOTE(review): the nesting below was reconstructed from key order and the
# target/params convention after indentation was lost — confirm against a
# known-good copy before relying on it.
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    input_key: image
    # Same value as the LatentEncoder `scale_factor` further down.
    scale_factor: 0.18215
    disable_first_stage_autocast: true

    # Denoiser wrapper: weighting, scaling and sigma discretization.
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        # Same value as sigma_sampler_config.params.num_idx in the loss.
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    # Denoising network.
    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetAddModel
      params:
        use_checkpoint: false
        # presumably 4 latent + 1 mask + 4 masked-image latent channels, to
        # match the two "concat cond" embedders below — TODO confirm.
        in_channels: 9
        out_channels: 4
        ctrl_channels: 0
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        attn_type: add_attn
        attn_layers:
          - output_blocks.6.1
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_spatial_transformer: true
        use_linear_in_transformer: true
        transformer_depth: 1
        # 0 here while the text (txt) embedder below is commented out.
        context_dim: 0
        # Equals LabelEncoder `emb_dim` below; presumably must stay in sync.
        add_context_dim: 2048
        legacy: false

    # Conditioning embedders, one list entry per conditioning input.
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond (disabled)
          # - is_trainable: False
          #   input_key: txt
          #   target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
          #   params:
          #     arch: ViT-H-14
          #     version: ./checkpoints/encoders/OpenCLIP/ViT-H-14/open_clip_pytorch_model.bin
          #     layer: penultimate
          # add crossattn cond
          - is_trainable: false
            input_key: label
            target: sgm.modules.encoders.modules.LabelEncoder
            params:
              is_add_embedder: true
              # Equals loss_fn_config.params.seq_len below.
              max_len: 12
              emb_dim: 2048
              n_heads: 8
              n_trans_layers: 12
              ckpt_path: ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt
          # concat cond
          - is_trainable: false
            input_key: mask
            target: sgm.modules.encoders.modules.IdentityEncoder
          - is_trainable: false
            input_key: masked
            target: sgm.modules.encoders.modules.LatentEncoder
            params:
              scale_factor: 0.18215
              # Autoencoder settings; same ddconfig values as
              # first_stage_config below, plus an explicit ckpt_path.
              config:
                target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
                params:
                  ckpt_path: ./checkpoints/AEs/AE_inpainting_2.safetensors
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_type: vanilla-xformers
                    double_z: true
                    z_channels: 4
                    resolution: 256
                    in_channels: 3
                    out_ch: 3
                    ch: 128
                    ch_mult: [1, 2, 4, 4]
                    num_res_blocks: 2
                    attn_resolutions: []
                    dropout: 0.0
                  lossconfig:
                    target: torch.nn.Identity

    # First-stage autoencoder. No ckpt_path is set in this node.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    # Loss settings.
    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.FullLoss  # StandardDiffusionLoss
      params:
        seq_len: 12
        kernel_size: 3
        gaussian_sigma: 0.5
        min_attn_size: 16
        lambda_local_loss: 0.02
        lambda_ocr_loss: 0.001
        ocr_enabled: false

        # NOTE(review): predictor_config and sigma_sampler_config are placed
        # under the loss `params` by inference — confirm against FullLoss.
        predictor_config:
          target: sgm.modules.predictors.model.ParseqPredictor
          params:
            ckpt_path: "./checkpoints/predictors/parseq-bb5792a6.pt"

        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 1000

            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization