Feature Extraction
File size: 2,786 Bytes
6125d7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
{
    "model": "vitamin_large",
    "exp_name": "unitok_large_causal",
    "output_dir": "local_output",
    "resume_from": "",
    "lpips_path": "",
    "dino_path": "",
    "fid_eval_src": "",
    "fid_eval_dst": "",
    "vis_img_dir": "asset/vis_imgs/",
    "fid_feature_extractor": "",
    "clip_pretrain_path": "",
    "fp16": false,
    "bf16": true,
    "tf32": true,
    "compile_model": false,
    "ddp_static": false,
    "grad_ckpt": true,
    "grad_accu": 1,
    "train_data": "",
    "val_data": null,
    "dataset_type": "webdataset",
    "imagenet_val": "",
    "imagenet_v2": null,
    "subset_ratio": 1.0,
    "img_size": 256,
    "resize_ratio": 1.125,
    "hflip": false,
    "workers": 16,
    "train_num_samples": 1280000000,
    "train_data_upsampling_factors": null,
    "dataset_resampled": false,
    "use_aug": false,
    "vocab_size": 32768,
    "vocab_width": 64,
    "vocab_norm": true,
    "vq_beta": 0.25,
    "num_codebooks": 8,
    "quant_proj": "attn",
    "embed_dim": 768,
    "num_query": 0,
    "use_clip_pretrain": false,
    "patch_size": 16,
    "drop_path": 0.1,
    "text_width": 768,
    "text_heads": 12,
    "text_layers": 12,
    "text_vocab_size": 49408,
    "text_context_length": 77,
    "local_loss": true,
    "gather_with_grad": true,
    "pretrained_clip": null,
    "pretrained_clip_text": null,
    "lock_text": false,
    "lock_text_unlocked_layers": 0,
    "lock_text_freeze_layer_norm": false,
    "force_custom_text": false,
    "force_custom_vision": false,
    "zeroshot_eval_freq": 1,
    "dino_depth": 12,
    "dino_kernel_size": 9,
    "disc_norm": "gn",
    "disc_aug_prob": 1.0,
    "disc_specnorm": false,
    "step_disc_every": 1,
    "vae_init": -0.5,
    "vocab_init": -1,
    "disc_init": -0.5,
    "epoch": 1,
    "local_bs": 56,
    "vae_local_bs": 56,
    "global_bs": 16384,
    "lr": 0.0005,
    "wd": 0.02,
    "disc_lr": 2e-05,
    "disc_wd": 0.2,
    "grad_clip": 10,
    "ema": 0.9999,
    "warmup_iter": null,
    "warmup_ep": 0.01,
    "disc_start_ep": 0.375,
    "disc_warmup_ep": 0.03,
    "schedule": "cos",
    "lr_start_ratio": 0.0,
    "lr_end_ratio": 0.1,
    "disc_lr_end_ratio": 0.1,
    "custom_lr_multiplier": null,
    "optimizer": "adamw",
    "optim_eps": 1e-06,
    "fuse_opt": false,
    "optim_beta": "0.9_0.95",
    "disc_optim_beta": "0.5_0.9",
    "l1": 0.2,
    "l2": 1.0,
    "lp": 1.0,
    "lpr": 48,
    "ld": 0.4,
    "le": 0.0,
    "lq": 1.0,
    "lc": 1.0,
    "e_temp": 0.01,
    "gada": 1,
    "bcr": 4.0,
    "bcr_cut": 0.2,
    "dcrit": "hg",
    "report_wandb": true,
    "wandb_notes": null,
    "run_id": null,
    "eval_per_epoch": 8,
    "dbg_unused_param": false,
    "dbg_nan": false,
    "seed": null,
    "deterministic": false,
    "same_seed_for_all_ranks": 0
}