Fabrice-TIERCELIN
committed on
' instead of "
hyvideo/constants.py CHANGED (+90 -90)
@@ -1,90 +1,90 @@
-import os
-import torch
-
-__all__ = [
-    "C_SCALE",
-    "PROMPT_TEMPLATE",
-    "MODEL_BASE",
-    "PRECISIONS",
-    "NORMALIZATION_TYPE",
-    "ACTIVATION_TYPE",
-    "VAE_PATH",
-    "TEXT_ENCODER_PATH",
-    "TOKENIZER_PATH",
-    "TEXT_PROJECTION",
-    "DATA_TYPE",
-    "NEGATIVE_PROMPT",
-]
-
-PRECISION_TO_TYPE = {
-    'fp32': torch.float32,
-    'fp16': torch.float16,
-    'bf16': torch.bfloat16,
-}
-
-# =================== Constant Values =====================
-# Computation scale factor, 1P = 1_000_000_000_000_000. Tensorboard will display the value in PetaFLOPS to avoid
-# overflow error when tensorboard logging values.
-C_SCALE = 1_000_000_000_000_000
-
-# When using decoder-only models, we must provide a prompt template to instruct the text encoder
-# on how to generate the text.
-# --------------------------------------------------------------------
-PROMPT_TEMPLATE_ENCODE = (
-    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
-    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
-    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
-)
-PROMPT_TEMPLATE_ENCODE_VIDEO = (
-    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
-    "1. The main content and theme of the video."
-    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
-    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
-    "4. background environment, light, style and atmosphere."
-    "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
-    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
-)
-
-NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
-
-PROMPT_TEMPLATE = {
-    "dit-llm-encode": {
-        "template": PROMPT_TEMPLATE_ENCODE,
-        "crop_start": 36,
-    },
-    "dit-llm-encode-video": {
-        "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
-        "crop_start": 95,
-    },
-}
-
-# ======================= Model ======================
-PRECISIONS = {"fp32", "fp16", "bf16"}
-NORMALIZATION_TYPE = {"layer", "rms"}
-ACTIVATION_TYPE = {"relu", "silu", "gelu", "gelu_tanh"}
-
-# =================== Model Path =====================
-MODEL_BASE = os.getenv("MODEL_BASE", "./ckpts")
-
-# =================== Data =======================
-DATA_TYPE = {"image", "video", "image_video"}
-
-# 3D VAE
-VAE_PATH = {"884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae"}
-
-# Text Encoder
-TEXT_ENCODER_PATH = {
-    "clipL": f"{MODEL_BASE}/text_encoder_2",
-    "llm": f"{MODEL_BASE}/text_encoder",
-}
-
-# Tokenizer
-TOKENIZER_PATH = {
-    "clipL": f"{MODEL_BASE}/text_encoder_2",
-    "llm": f"{MODEL_BASE}/text_encoder",
-}
-
-TEXT_PROJECTION = {
-    "linear",  # Default, an nn.Linear() layer
-    "single_refiner",  # Single TokenRefiner. Refer to LI-DiT
-}
+import os
+import torch
+
+__all__ = [
+    "C_SCALE",
+    "PROMPT_TEMPLATE",
+    "MODEL_BASE",
+    "PRECISIONS",
+    "NORMALIZATION_TYPE",
+    "ACTIVATION_TYPE",
+    "VAE_PATH",
+    "TEXT_ENCODER_PATH",
+    "TOKENIZER_PATH",
+    "TEXT_PROJECTION",
+    "DATA_TYPE",
+    "NEGATIVE_PROMPT",
+]
+
+PRECISION_TO_TYPE = {
+    "fp32": torch.float32,
+    "fp16": torch.float16,
+    "bf16": torch.bfloat16,
+}
+
+# =================== Constant Values =====================
+# Computation scale factor, 1P = 1_000_000_000_000_000. Tensorboard will display the value in PetaFLOPS to avoid
+# overflow error when tensorboard logging values.
+C_SCALE = 1_000_000_000_000_000
+
+# When using decoder-only models, we must provide a prompt template to instruct the text encoder
+# on how to generate the text.
+# --------------------------------------------------------------------
+PROMPT_TEMPLATE_ENCODE = (
+    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
+    "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
+    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+)
+PROMPT_TEMPLATE_ENCODE_VIDEO = (
+    "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
+    "1. The main content and theme of the video."
+    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+    "4. background environment, light, style and atmosphere."
+    "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
+    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+)
+
+NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
+
+PROMPT_TEMPLATE = {
+    "dit-llm-encode": {
+        "template": PROMPT_TEMPLATE_ENCODE,
+        "crop_start": 36,
+    },
+    "dit-llm-encode-video": {
+        "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
+        "crop_start": 95,
+    },
+}
+
+# ======================= Model ======================
+PRECISIONS = {"fp32", "fp16", "bf16"}
+NORMALIZATION_TYPE = {"layer", "rms"}
+ACTIVATION_TYPE = {"relu", "silu", "gelu", "gelu_tanh"}
+
+# =================== Model Path =====================
+MODEL_BASE = os.getenv("MODEL_BASE", "./ckpts")
+
+# =================== Data =======================
+DATA_TYPE = {"image", "video", "image_video"}
+
+# 3D VAE
+VAE_PATH = {"884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae"}
+
+# Text Encoder
+TEXT_ENCODER_PATH = {
+    "clipL": f"{MODEL_BASE}/text_encoder_2",
+    "llm": f"{MODEL_BASE}/text_encoder",
+}
+
+# Tokenizer
+TOKENIZER_PATH = {
+    "clipL": f"{MODEL_BASE}/text_encoder_2",
+    "llm": f"{MODEL_BASE}/text_encoder",
+}
+
+TEXT_PROJECTION = {
+    "linear",  # Default, an nn.Linear() layer
+    "single_refiner",  # Single TokenRefiner. Refer to LI-DiT
+}
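The change itself is purely cosmetic: both revisions define the same three-entry mapping, and the new one simply quotes the PRECISION_TO_TYPE keys with double quotes to match the quoting style used everywhere else in the file. For context, here is a minimal sketch of how a precision-to-dtype table like this is typically consumed; the resolve_dtype helper is hypothetical, not code from this repo:

    import torch

    from hyvideo.constants import PRECISION_TO_TYPE

    def resolve_dtype(precision: str) -> torch.dtype:
        # Hypothetical helper: map a flag such as "bf16" to its torch dtype.
        if precision not in PRECISION_TO_TYPE:
            raise ValueError(f"precision must be one of {sorted(PRECISION_TO_TYPE)}")
        return PRECISION_TO_TYPE[precision]

    model = torch.nn.Linear(8, 8).to(dtype=resolve_dtype("bf16"))

Since both quoting styles produce identical string keys, lookups like the one above behave the same before and after this commit.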
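C_SCALE supports the comment above it: raw FLOP counts on the order of 10^15 are divided down to PetaFLOPs before being sent to TensorBoard, keeping the logged scalar in a comfortable floating-point range. A hedged sketch using the standard torch.utils.tensorboard API; the tag and the FLOP figure are illustrative, not measured:

    from torch.utils.tensorboard import SummaryWriter

    from hyvideo.constants import C_SCALE

    writer = SummaryWriter(log_dir="runs/demo")
    total_flops = 3.2e15  # made-up value for illustration
    # Divide by C_SCALE so the scalar is logged in PFLOPs.
    writer.add_scalar("compute/pflops", total_flops / C_SCALE, global_step=0)
    writer.close()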
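Each template's {} slot takes the user prompt via str.format, and crop_start records how many tokens the fixed chat-style prefix occupies, so the encoder's hidden states can be cut back to just the prompt portion (36 tokens for the image template, 95 for the video one). The following is a sketch under assumptions, not the repo's actual encoding path: it presumes the "llm" entries point at a checkpoint loadable with Hugging Face AutoTokenizer/AutoModel and that the prefix really tokenizes to crop_start tokens:

    from transformers import AutoModel, AutoTokenizer

    from hyvideo.constants import PROMPT_TEMPLATE, TEXT_ENCODER_PATH, TOKENIZER_PATH

    cfg = PROMPT_TEMPLATE["dit-llm-encode-video"]
    text = cfg["template"].format("A corgi surfing at golden hour")  # fill the {} slot

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH["llm"])
    text_encoder = AutoModel.from_pretrained(TEXT_ENCODER_PATH["llm"])

    inputs = tokenizer(text, return_tensors="pt")
    hidden = text_encoder(**inputs).last_hidden_state
    # Drop the first crop_start positions (the template prefix) so only the
    # user prompt conditions generation.
    prompt_embeds = hidden[:, cfg["crop_start"]:]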
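One operational detail worth noting: MODEL_BASE is read from the environment at import time and baked into the f-string paths below it, so overriding the checkpoint directory only works if the variable is set before hyvideo.constants is first imported:

    import os

    # Must run before the first import of hyvideo.constants; the path
    # dicts are built from MODEL_BASE when the module loads.
    os.environ["MODEL_BASE"] = "/data/ckpts"  # illustrative location

    from hyvideo.constants import MODEL_BASE, TEXT_ENCODER_PATH, VAE_PATH

    assert MODEL_BASE == "/data/ckpts"
    print(VAE_PATH["884-16c-hy"])    # /data/ckpts/hunyuan-video-t2v-720p/vae
    print(TEXT_ENCODER_PATH["llm"])  # /data/ckpts/text_encoder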
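Finally, PRECISIONS, NORMALIZATION_TYPE, ACTIVATION_TYPE, DATA_TYPE, and TEXT_PROJECTION are plain sets rather than dicts, which suggests they exist for membership checks on configuration strings. A hypothetical validator in that spirit:

    from hyvideo.constants import ACTIVATION_TYPE, DATA_TYPE, PRECISIONS

    def validate_args(precision: str, activation: str, data_type: str) -> None:
        # Hypothetical helper: fail fast on unsupported option strings.
        for name, value, allowed in (
            ("precision", precision, PRECISIONS),
            ("activation", activation, ACTIVATION_TYPE),
            ("data_type", data_type, DATA_TYPE),
        ):
            if value not in allowed:
                raise ValueError(f"{name} must be one of {sorted(allowed)}, got {value!r}")

    validate_args("bf16", "silu", "video")  # passes silently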