Spaces:
Running
Running
Add support for multimodal processor chat_template.json
Browse files
app.py
CHANGED
@@ -13,6 +13,7 @@ hfapi = HfApi()
|
|
13 |
|
14 |
|
15 |
class ModelFiles(StrEnum):
|
|
|
16 |
TOKENIZER_CHAT_TEMPLATE = "tokenizer_chat_template.jinja"
|
17 |
TOKENIZER_CONFIG = "tokenizer_config.json"
|
18 |
TOKENIZER_INVERSE_TEMPLATE = "inverse_template.jinja"
|
@@ -27,6 +28,7 @@ example_labels = [
|
|
27 |
"Tool call with multiple responses",
|
28 |
"Tool call with complex tool definition",
|
29 |
"RAG call",
|
|
|
30 |
]
|
31 |
example_values = [
|
32 |
[
|
@@ -315,6 +317,37 @@ example_values = [
|
|
315 |
}
|
316 |
]""",
|
317 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
]
|
319 |
|
320 |
|
@@ -678,30 +711,32 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
678 |
org_template_tool_use = ""
|
679 |
org_template_rag = ""
|
680 |
|
681 |
-
|
682 |
-
|
|
|
|
|
683 |
|
684 |
-
|
685 |
-
|
686 |
|
687 |
-
|
688 |
-
|
689 |
-
|
690 |
-
|
691 |
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
|
697 |
-
|
698 |
-
|
699 |
-
|
700 |
|
701 |
-
|
702 |
-
|
703 |
-
|
704 |
-
|
705 |
|
706 |
tokenizer_chat_template = info.get(ModelFiles.TOKENIZER_CHAT_TEMPLATE, {})
|
707 |
org_template = tokenizer_chat_template.get("data", org_template)
|
@@ -766,15 +801,33 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
766 |
operations = []
|
767 |
pr_branch = branch if branch.startswith("refs/pr/") else None
|
768 |
|
769 |
-
|
770 |
-
if
|
771 |
-
tokenizer_config = TokenizerConfig(
|
772 |
|
773 |
tokenizer_config.chat_templates["default"] = template
|
774 |
tokenizer_config.chat_templates["tool_use"] = template_tool_use
|
775 |
tokenizer_config.chat_templates["rag"] = template_rag
|
776 |
# tokenizer_config.inverse_template = template_inverse
|
777 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
778 |
new_config = tokenizer_config.json(get_json_indent(org_config))
|
779 |
if org_config.endswith("\n"):
|
780 |
new_config += "\n"
|
@@ -817,6 +870,10 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
817 |
|
818 |
info["parent_commit"] = commit.oid
|
819 |
|
|
|
|
|
|
|
|
|
820 |
if org_config:
|
821 |
tokenizer_file["data"] = new_config
|
822 |
tokenizer_file["content"] = json.loads(new_config)
|
@@ -950,6 +1007,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
950 |
"disabled": info.disabled,
|
951 |
"gated": info.gated,
|
952 |
"private": info.private,
|
|
|
953 |
}
|
954 |
|
955 |
template_messages = example_values[0][1]
|
@@ -991,6 +1049,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
991 |
progress = gr.Progress(track_tqdm = True),
|
992 |
oauth_token: gr.OAuthToken | None = None,
|
993 |
):
|
|
|
994 |
write_access = False
|
995 |
|
996 |
if info and oauth_token:
|
@@ -1097,6 +1156,34 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
1097 |
else:
|
1098 |
gr.Warning(f"No {ModelFiles.TOKENIZER_CONFIG} found in repository...")
|
1099 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1100 |
pr_details = None
|
1101 |
if branch and branch.startswith("refs/pr/"):
|
1102 |
pr_num = branch.split("/")[-1]
|
@@ -1129,9 +1216,9 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
1129 |
pr_submit: gr.Button(
|
1130 |
value = f"Commit to PR #{pr_details.num}" if pr_details else "Create Pull Request",
|
1131 |
),
|
1132 |
-
|
1133 |
-
|
1134 |
-
|
1135 |
# inverse_template: gr.skip() if ModelFiles.TOKENIZER_INVERSE_TEMPLATE not in info else gr.Code(
|
1136 |
# value = info[ModelFiles.TOKENIZER_INVERSE_TEMPLATE]["data"],
|
1137 |
# ),
|
@@ -1198,7 +1285,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
1198 |
pr_preview_title,
|
1199 |
pr_description,
|
1200 |
pr_submit,
|
1201 |
-
|
1202 |
# inverse_template,
|
1203 |
],
|
1204 |
show_api = False,
|
|
|
13 |
|
14 |
|
15 |
class ModelFiles(StrEnum):
|
16 |
+
CHAT_TEMPLATE_JSON = "chat_template.json"
|
17 |
TOKENIZER_CHAT_TEMPLATE = "tokenizer_chat_template.jinja"
|
18 |
TOKENIZER_CONFIG = "tokenizer_config.json"
|
19 |
TOKENIZER_INVERSE_TEMPLATE = "inverse_template.jinja"
|
|
|
28 |
"Tool call with multiple responses",
|
29 |
"Tool call with complex tool definition",
|
30 |
"RAG call",
|
31 |
+
"Multimodal user message",
|
32 |
]
|
33 |
example_values = [
|
34 |
[
|
|
|
317 |
}
|
318 |
]""",
|
319 |
],
|
320 |
+
[
|
321 |
+
"{}",
|
322 |
+
"""[
|
323 |
+
{
|
324 |
+
"role": "user",
|
325 |
+
"content": [
|
326 |
+
{
|
327 |
+
"type": "text",
|
328 |
+
"content": "Can this animal"
|
329 |
+
},
|
330 |
+
{
|
331 |
+
"type": "image"
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"type": "text",
|
335 |
+
"content": "make this sound"
|
336 |
+
},
|
337 |
+
{
|
338 |
+
"type": "audio"
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"type": "text",
|
342 |
+
"content": "while moving like this?"
|
343 |
+
},
|
344 |
+
{
|
345 |
+
"type": "video"
|
346 |
+
}
|
347 |
+
]
|
348 |
+
}
|
349 |
+
]"""
|
350 |
+
]
|
351 |
]
|
352 |
|
353 |
|
|
|
711 |
org_template_tool_use = ""
|
712 |
org_template_rag = ""
|
713 |
|
714 |
+
for config_file_name in (ModelFiles.CHAT_TEMPLATE_JSON, ModelFiles.TOKENIZER_CONFIG):
|
715 |
+
config_file = info.get(config_file_name, {})
|
716 |
+
org_config = config_file.get("data")
|
717 |
+
org_content = config_file.get("content")
|
718 |
|
719 |
+
if org_content and ("chat_template" in org_content or not org_template):
|
720 |
+
tokenizer_config = TokenizerConfig(org_content)
|
721 |
|
722 |
+
org_template = tokenizer_config.chat_templates.get("default") or ""
|
723 |
+
org_template_tool_use = tokenizer_config.chat_templates.get("tool_use") or ""
|
724 |
+
org_template_rag = tokenizer_config.chat_templates.get("rag") or ""
|
725 |
+
# org_template_inverse = tokenizer_config.inverse_template or ""
|
726 |
|
727 |
+
tokenizer_config.chat_templates["default"] = template
|
728 |
+
tokenizer_config.chat_templates["tool_use"] = template_tool_use
|
729 |
+
tokenizer_config.chat_templates["rag"] = template_rag
|
730 |
+
# tokenizer_config.inverse_template = template_inverse
|
731 |
|
732 |
+
new_config = tokenizer_config.json(get_json_indent(org_config))
|
733 |
+
if org_config.endswith("\n"):
|
734 |
+
new_config += "\n"
|
735 |
|
736 |
+
changes += [
|
737 |
+
(token if token[1] in ("-", "+", "@") else token[1:].replace("\t", "\u21e5").replace("\r\n", "\u240d\u240a\r\n").replace("\r", "\u240d\r").replace("\n", "\u240a\n"), token[0] if token[0] != " " else None) # .replace(" ", "\u2423")
|
738 |
+
for token in unified_diff(new_config.splitlines(keepends = True), org_config.splitlines(keepends = True), fromfile = config_file_name, tofile = config_file_name)
|
739 |
+
]
|
740 |
|
741 |
tokenizer_chat_template = info.get(ModelFiles.TOKENIZER_CHAT_TEMPLATE, {})
|
742 |
org_template = tokenizer_chat_template.get("data", org_template)
|
|
|
801 |
operations = []
|
802 |
pr_branch = branch if branch.startswith("refs/pr/") else None
|
803 |
|
804 |
+
chat_template_file = info.get(ModelFiles.CHAT_TEMPLATE_JSON, {})
|
805 |
+
if org_chat_template := chat_template_file.get("data"):
|
806 |
+
tokenizer_config = TokenizerConfig(chat_template_file.get("content"))
|
807 |
|
808 |
tokenizer_config.chat_templates["default"] = template
|
809 |
tokenizer_config.chat_templates["tool_use"] = template_tool_use
|
810 |
tokenizer_config.chat_templates["rag"] = template_rag
|
811 |
# tokenizer_config.inverse_template = template_inverse
|
812 |
|
813 |
+
new_chat_template = tokenizer_config.json(get_json_indent(org_chat_template))
|
814 |
+
if org_chat_template.endswith("\n"):
|
815 |
+
new_chat_template += "\n"
|
816 |
+
|
817 |
+
if org_chat_template != new_chat_template:
|
818 |
+
operations.append(CommitOperationAdd(ModelFiles.CHAT_TEMPLATE_JSON, new_chat_template.encode("utf-8")))
|
819 |
+
|
820 |
+
tokenizer_file = info.get(ModelFiles.TOKENIZER_CONFIG, {})
|
821 |
+
if org_config := tokenizer_file.get("data"):
|
822 |
+
tokenizer_content = tokenizer_file.get("content")
|
823 |
+
tokenizer_config = TokenizerConfig(tokenizer_content)
|
824 |
+
|
825 |
+
if "chat_template" in tokenizer_content or not chat_template_file:
|
826 |
+
tokenizer_config.chat_templates["default"] = template
|
827 |
+
tokenizer_config.chat_templates["tool_use"] = template_tool_use
|
828 |
+
tokenizer_config.chat_templates["rag"] = template_rag
|
829 |
+
# tokenizer_config.inverse_template = template_inverse
|
830 |
+
|
831 |
new_config = tokenizer_config.json(get_json_indent(org_config))
|
832 |
if org_config.endswith("\n"):
|
833 |
new_config += "\n"
|
|
|
870 |
|
871 |
info["parent_commit"] = commit.oid
|
872 |
|
873 |
+
if org_chat_template:
|
874 |
+
chat_template_file["data"] = new_chat_template
|
875 |
+
chat_template_file["content"] = json.loads(new_chat_template)
|
876 |
+
|
877 |
if org_config:
|
878 |
tokenizer_file["data"] = new_config
|
879 |
tokenizer_file["content"] = json.loads(new_config)
|
|
|
1007 |
"disabled": info.disabled,
|
1008 |
"gated": info.gated,
|
1009 |
"private": info.private,
|
1010 |
+
"chat_template": templates,
|
1011 |
}
|
1012 |
|
1013 |
template_messages = example_values[0][1]
|
|
|
1049 |
progress = gr.Progress(track_tqdm = True),
|
1050 |
oauth_token: gr.OAuthToken | None = None,
|
1051 |
):
|
1052 |
+
parent_commit = None
|
1053 |
write_access = False
|
1054 |
|
1055 |
if info and oauth_token:
|
|
|
1156 |
else:
|
1157 |
gr.Warning(f"No {ModelFiles.TOKENIZER_CONFIG} found in repository...")
|
1158 |
|
1159 |
+
if not info.get("chat_template"):
|
1160 |
+
try:
|
1161 |
+
chat_template_file = None
|
1162 |
+
if (hfapi.file_exists(
|
1163 |
+
repo,
|
1164 |
+
ModelFiles.CHAT_TEMPLATE_JSON,
|
1165 |
+
revision = branch,
|
1166 |
+
token = oauth_token.token if oauth_token else False,
|
1167 |
+
)):
|
1168 |
+
chat_template_file = hfapi.hf_hub_download(
|
1169 |
+
repo,
|
1170 |
+
ModelFiles.CHAT_TEMPLATE_JSON,
|
1171 |
+
revision = parent_commit or branch,
|
1172 |
+
token = oauth_token.token if oauth_token else False,
|
1173 |
+
)
|
1174 |
+
except Exception as e:
|
1175 |
+
pass
|
1176 |
+
else:
|
1177 |
+
if chat_template_file:
|
1178 |
+
with open(chat_template_file, "r", encoding = "utf-8") as fp:
|
1179 |
+
template_data = fp.read()
|
1180 |
+
template_content = json.loads(template_data)
|
1181 |
+
info[ModelFiles.CHAT_TEMPLATE_JSON] = {
|
1182 |
+
"data": template_data,
|
1183 |
+
"content": template_content,
|
1184 |
+
}
|
1185 |
+
info["chat_template"] = template_content.get("chat_template")
|
1186 |
+
|
1187 |
pr_details = None
|
1188 |
if branch and branch.startswith("refs/pr/"):
|
1189 |
pr_num = branch.split("/")[-1]
|
|
|
1216 |
pr_submit: gr.Button(
|
1217 |
value = f"Commit to PR #{pr_details.num}" if pr_details else "Create Pull Request",
|
1218 |
),
|
1219 |
+
chat_template: gr.skip() if ModelFiles.CHAT_TEMPLATE_JSON not in info else gr.Code(
|
1220 |
+
value = TokenizerConfig(info[ModelFiles.CHAT_TEMPLATE_JSON]["content"]).chat_templates.get("default"),
|
1221 |
+
),
|
1222 |
# inverse_template: gr.skip() if ModelFiles.TOKENIZER_INVERSE_TEMPLATE not in info else gr.Code(
|
1223 |
# value = info[ModelFiles.TOKENIZER_INVERSE_TEMPLATE]["data"],
|
1224 |
# ),
|
|
|
1285 |
pr_preview_title,
|
1286 |
pr_description,
|
1287 |
pr_submit,
|
1288 |
+
chat_template,
|
1289 |
# inverse_template,
|
1290 |
],
|
1291 |
show_api = False,
|