# v1-inference.yaml
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false  # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
71