Fabrice-TIERCELIN committed on
Commit
dfd1126
·
verified ·
1 Parent(s): 0c4a889

Use " instead of '

Browse files
Files changed (1) hide show
  1. hyvideo/constants.py +90 -90
hyvideo/constants.py CHANGED
@@ -1,90 +1,90 @@
1
- import os
2
- import torch
3
-
4
- __all__ = [
5
- "C_SCALE",
6
- "PROMPT_TEMPLATE",
7
- "MODEL_BASE",
8
- "PRECISIONS",
9
- "NORMALIZATION_TYPE",
10
- "ACTIVATION_TYPE",
11
- "VAE_PATH",
12
- "TEXT_ENCODER_PATH",
13
- "TOKENIZER_PATH",
14
- "TEXT_PROJECTION",
15
- "DATA_TYPE",
16
- "NEGATIVE_PROMPT",
17
- ]
18
-
19
- PRECISION_TO_TYPE = {
20
- 'fp32': torch.float32,
21
- 'fp16': torch.float16,
22
- 'bf16': torch.bfloat16,
23
- }
24
-
25
- # =================== Constant Values =====================
26
- # Computation scale factor, 1P = 1_000_000_000_000_000. Tensorboard will display the value in PetaFLOPS to avoid
27
- # overflow error when tensorboard logging values.
28
- C_SCALE = 1_000_000_000_000_000
29
-
30
- # When using decoder-only models, we must provide a prompt template to instruct the text encoder
31
- # on how to generate the text.
32
- # --------------------------------------------------------------------
33
- PROMPT_TEMPLATE_ENCODE = (
34
- "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
35
- "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
36
- "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
37
- )
38
- PROMPT_TEMPLATE_ENCODE_VIDEO = (
39
- "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
40
- "1. The main content and theme of the video."
41
- "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
42
- "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
43
- "4. background environment, light, style and atmosphere."
44
- "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
45
- "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
46
- )
47
-
48
- NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
49
-
50
- PROMPT_TEMPLATE = {
51
- "dit-llm-encode": {
52
- "template": PROMPT_TEMPLATE_ENCODE,
53
- "crop_start": 36,
54
- },
55
- "dit-llm-encode-video": {
56
- "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
57
- "crop_start": 95,
58
- },
59
- }
60
-
61
- # ======================= Model ======================
62
- PRECISIONS = {"fp32", "fp16", "bf16"}
63
- NORMALIZATION_TYPE = {"layer", "rms"}
64
- ACTIVATION_TYPE = {"relu", "silu", "gelu", "gelu_tanh"}
65
-
66
- # =================== Model Path =====================
67
- MODEL_BASE = os.getenv("MODEL_BASE", "./ckpts")
68
-
69
- # =================== Data =======================
70
- DATA_TYPE = {"image", "video", "image_video"}
71
-
72
- # 3D VAE
73
- VAE_PATH = {"884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae"}
74
-
75
- # Text Encoder
76
- TEXT_ENCODER_PATH = {
77
- "clipL": f"{MODEL_BASE}/text_encoder_2",
78
- "llm": f"{MODEL_BASE}/text_encoder",
79
- }
80
-
81
- # Tokenizer
82
- TOKENIZER_PATH = {
83
- "clipL": f"{MODEL_BASE}/text_encoder_2",
84
- "llm": f"{MODEL_BASE}/text_encoder",
85
- }
86
-
87
- TEXT_PROJECTION = {
88
- "linear", # Default, an nn.Linear() layer
89
- "single_refiner", # Single TokenRefiner. Refer to LI-DiT
90
- }
 
"""Module-level constants for the HunyuanVideo text-to-video pipeline.

Defines precision/dtype mappings, prompt templates for the decoder-only
text encoder, model/checkpoint path lookups, and small closed sets of
valid option strings used for argument validation elsewhere.
"""
import os
import torch

__all__ = [
    "C_SCALE",
    "PROMPT_TEMPLATE",
    "MODEL_BASE",
    "PRECISION_TO_TYPE",
    "PRECISIONS",
    "NORMALIZATION_TYPE",
    "ACTIVATION_TYPE",
    "VAE_PATH",
    "TEXT_ENCODER_PATH",
    "TOKENIZER_PATH",
    "TEXT_PROJECTION",
    "DATA_TYPE",
    "NEGATIVE_PROMPT",
]

# Maps a precision flag (as passed on the CLI/config) to the torch dtype.
PRECISION_TO_TYPE = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}

# =================== Constant Values =====================
# Computation scale factor, 1P = 1_000_000_000_000_000. Tensorboard will display the value in PetaFLOPS to avoid
# overflow error when tensorboard logging values.
C_SCALE = 1_000_000_000_000_000

# When using decoder-only models, we must provide a prompt template to instruct the text encoder
# on how to generate the text.
# --------------------------------------------------------------------
# NOTE(review): the numbered items below concatenate without separating
# spaces/newlines; the `crop_start` values in PROMPT_TEMPLATE depend on the
# exact tokenized length of these strings, so do not reformat them.
PROMPT_TEMPLATE_ENCODE = (
    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
)
PROMPT_TEMPLATE_ENCODE_VIDEO = (
    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
)

NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"

# Prompt-template registry. `crop_start` is the number of leading tokens
# (the system/instruction prefix) to drop from the encoder output.
PROMPT_TEMPLATE = {
    "dit-llm-encode": {
        "template": PROMPT_TEMPLATE_ENCODE,
        "crop_start": 36,
    },
    "dit-llm-encode-video": {
        "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
        "crop_start": 95,
    },
}

# ======================= Model ======================
# Valid precision flags — derived from PRECISION_TO_TYPE so the two can
# never drift out of sync.
PRECISIONS = set(PRECISION_TO_TYPE)
NORMALIZATION_TYPE = {"layer", "rms"}
ACTIVATION_TYPE = {"relu", "silu", "gelu", "gelu_tanh"}

# =================== Model Path =====================
# Checkpoint root; overridable via the MODEL_BASE environment variable.
MODEL_BASE = os.getenv("MODEL_BASE", "./ckpts")

# =================== Data =======================
DATA_TYPE = {"image", "video", "image_video"}

# 3D VAE
VAE_PATH = {"884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae"}

# Text Encoder
TEXT_ENCODER_PATH = {
    "clipL": f"{MODEL_BASE}/text_encoder_2",
    "llm": f"{MODEL_BASE}/text_encoder",
}

# Tokenizer (co-located with each text encoder's checkpoint directory)
TOKENIZER_PATH = {
    "clipL": f"{MODEL_BASE}/text_encoder_2",
    "llm": f"{MODEL_BASE}/text_encoder",
}

TEXT_PROJECTION = {
    "linear",  # Default, an nn.Linear() layer
    "single_refiner",  # Single TokenRefiner. Refer to LI-DiT
}