CISCai committed
Commit e0e0d39 · verified · 1 Parent(s): a901810

Add support for multimodal processor chat_template.json

Files changed (1)
  1. app.py +113 -26
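
Note for reviewers: multimodal processors store their chat template in a standalone chat_template.json file instead of (or in addition to) tokenizer_config.json, and this commit teaches the editor to read, preview and commit that file as well. A minimal sketch of the file shape this change assumes (a JSON object whose "chat_template" key holds the Jinja template); the template string itself is purely illustrative:

import json

# Illustrative only: the rough shape of a chat_template.json as saved by a
# multimodal processor. The editor reads the "chat_template" key and writes the
# whole document back via CommitOperationAdd (see the diff below).
example_chat_template_json = json.dumps(
    {
        "chat_template": "{%- for message in messages -%}<|{{ message['role'] }}|>{{ message['content'] }}{%- endfor -%}",
    },
    indent = 2,
) + "\n"

assert "chat_template" in json.loads(example_chat_template_json)
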
app.py CHANGED
@@ -13,6 +13,7 @@ hfapi = HfApi()
 
 
 class ModelFiles(StrEnum):
+    CHAT_TEMPLATE_JSON = "chat_template.json"
     TOKENIZER_CHAT_TEMPLATE = "tokenizer_chat_template.jinja"
     TOKENIZER_CONFIG = "tokenizer_config.json"
     TOKENIZER_INVERSE_TEMPLATE = "inverse_template.jinja"
@@ -27,6 +28,7 @@ example_labels = [
     "Tool call with multiple responses",
     "Tool call with complex tool definition",
     "RAG call",
+    "Multimodal user message",
 ]
 example_values = [
     [
@@ -315,6 +317,37 @@ example_values = [
         }
     ]""",
     ],
+    [
+        "{}",
+        """[
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "content": "Can this animal"
+            },
+            {
+                "type": "image"
+            },
+            {
+                "type": "text",
+                "content": "make this sound"
+            },
+            {
+                "type": "audio"
+            },
+            {
+                "type": "text",
+                "content": "while moving like this?"
+            },
+            {
+                "type": "video"
+            }
+        ]
+    }
+]"""
+    ]
 ]
 
 
@@ -678,30 +711,32 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
     org_template_tool_use = ""
     org_template_rag = ""
 
-    tokenizer_file = info.get(ModelFiles.TOKENIZER_CONFIG, {})
-    org_config = tokenizer_file.get("data")
+    for config_file_name in (ModelFiles.CHAT_TEMPLATE_JSON, ModelFiles.TOKENIZER_CONFIG):
+        config_file = info.get(config_file_name, {})
+        org_config = config_file.get("data")
+        org_content = config_file.get("content")
 
-    if org_config:
-        tokenizer_config = TokenizerConfig(tokenizer_file.get("content"))
+        if org_content and ("chat_template" in org_content or not org_template):
+            tokenizer_config = TokenizerConfig(org_content)
 
-        org_template = tokenizer_config.chat_templates.get("default") or ""
-        org_template_tool_use = tokenizer_config.chat_templates.get("tool_use") or ""
-        org_template_rag = tokenizer_config.chat_templates.get("rag") or ""
-        # org_template_inverse = tokenizer_config.inverse_template or ""
+            org_template = tokenizer_config.chat_templates.get("default") or ""
+            org_template_tool_use = tokenizer_config.chat_templates.get("tool_use") or ""
+            org_template_rag = tokenizer_config.chat_templates.get("rag") or ""
+            # org_template_inverse = tokenizer_config.inverse_template or ""
 
-        tokenizer_config.chat_templates["default"] = template
-        tokenizer_config.chat_templates["tool_use"] = template_tool_use
-        tokenizer_config.chat_templates["rag"] = template_rag
-        # tokenizer_config.inverse_template = template_inverse
+            tokenizer_config.chat_templates["default"] = template
+            tokenizer_config.chat_templates["tool_use"] = template_tool_use
+            tokenizer_config.chat_templates["rag"] = template_rag
+            # tokenizer_config.inverse_template = template_inverse
 
-        new_config = tokenizer_config.json(get_json_indent(org_config))
-        if org_config.endswith("\n"):
-            new_config += "\n"
+            new_config = tokenizer_config.json(get_json_indent(org_config))
+            if org_config.endswith("\n"):
+                new_config += "\n"
 
-        changes += [
-            (token if token[1] in ("-", "+", "@") else token[1:].replace("\t", "\u21e5").replace("\r\n", "\u240d\u240a\r\n").replace("\r", "\u240d\r").replace("\n", "\u240a\n"), token[0] if token[0] != " " else None) # .replace(" ", "\u2423")
-            for token in unified_diff(new_config.splitlines(keepends = True), org_config.splitlines(keepends = True), fromfile = ModelFiles.TOKENIZER_CONFIG, tofile = ModelFiles.TOKENIZER_CONFIG)
-        ]
+            changes += [
+                (token if token[1] in ("-", "+", "@") else token[1:].replace("\t", "\u21e5").replace("\r\n", "\u240d\u240a\r\n").replace("\r", "\u240d\r").replace("\n", "\u240a\n"), token[0] if token[0] != " " else None) # .replace(" ", "\u2423")
+                for token in unified_diff(new_config.splitlines(keepends = True), org_config.splitlines(keepends = True), fromfile = config_file_name, tofile = config_file_name)
+            ]
 
     tokenizer_chat_template = info.get(ModelFiles.TOKENIZER_CHAT_TEMPLATE, {})
     org_template = tokenizer_chat_template.get("data", org_template)
@@ -766,15 +801,33 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
     operations = []
     pr_branch = branch if branch.startswith("refs/pr/") else None
 
-    tokenizer_file = info.get(ModelFiles.TOKENIZER_CONFIG, {})
-    if org_config := tokenizer_file.get("data"):
-        tokenizer_config = TokenizerConfig(tokenizer_file.get("content"))
+    chat_template_file = info.get(ModelFiles.CHAT_TEMPLATE_JSON, {})
+    if org_chat_template := chat_template_file.get("data"):
+        tokenizer_config = TokenizerConfig(chat_template_file.get("content"))
 
         tokenizer_config.chat_templates["default"] = template
         tokenizer_config.chat_templates["tool_use"] = template_tool_use
         tokenizer_config.chat_templates["rag"] = template_rag
         # tokenizer_config.inverse_template = template_inverse
 
+        new_chat_template = tokenizer_config.json(get_json_indent(org_chat_template))
+        if org_chat_template.endswith("\n"):
+            new_chat_template += "\n"
+
+        if org_chat_template != new_chat_template:
+            operations.append(CommitOperationAdd(ModelFiles.CHAT_TEMPLATE_JSON, new_chat_template.encode("utf-8")))
+
+    tokenizer_file = info.get(ModelFiles.TOKENIZER_CONFIG, {})
+    if org_config := tokenizer_file.get("data"):
+        tokenizer_content = tokenizer_file.get("content")
+        tokenizer_config = TokenizerConfig(tokenizer_content)
+
+        if "chat_template" in tokenizer_content or not chat_template_file:
+            tokenizer_config.chat_templates["default"] = template
+            tokenizer_config.chat_templates["tool_use"] = template_tool_use
+            tokenizer_config.chat_templates["rag"] = template_rag
+            # tokenizer_config.inverse_template = template_inverse
+
         new_config = tokenizer_config.json(get_json_indent(org_config))
         if org_config.endswith("\n"):
             new_config += "\n"
@@ -817,6 +870,10 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
 
     info["parent_commit"] = commit.oid
 
+    if org_chat_template:
+        chat_template_file["data"] = new_chat_template
+        chat_template_file["content"] = json.loads(new_chat_template)
+
     if org_config:
         tokenizer_file["data"] = new_config
         tokenizer_file["content"] = json.loads(new_config)
@@ -950,6 +1007,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
         "disabled": info.disabled,
         "gated": info.gated,
        "private": info.private,
+        "chat_template": templates,
     }
 
     template_messages = example_values[0][1]
@@ -991,6 +1049,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
     progress = gr.Progress(track_tqdm = True),
     oauth_token: gr.OAuthToken | None = None,
 ):
+    parent_commit = None
     write_access = False
 
     if info and oauth_token:
@@ -1097,6 +1156,34 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
     else:
         gr.Warning(f"No {ModelFiles.TOKENIZER_CONFIG} found in repository...")
 
+    if not info.get("chat_template"):
+        try:
+            chat_template_file = None
+            if (hfapi.file_exists(
+                repo,
+                ModelFiles.CHAT_TEMPLATE_JSON,
+                revision = branch,
+                token = oauth_token.token if oauth_token else False,
+            )):
+                chat_template_file = hfapi.hf_hub_download(
+                    repo,
+                    ModelFiles.CHAT_TEMPLATE_JSON,
+                    revision = parent_commit or branch,
+                    token = oauth_token.token if oauth_token else False,
+                )
+        except Exception as e:
+            pass
+        else:
+            if chat_template_file:
+                with open(chat_template_file, "r", encoding = "utf-8") as fp:
+                    template_data = fp.read()
+                template_content = json.loads(template_data)
+                info[ModelFiles.CHAT_TEMPLATE_JSON] = {
+                    "data": template_data,
+                    "content": template_content,
+                }
+                info["chat_template"] = template_content.get("chat_template")
+
     pr_details = None
     if branch and branch.startswith("refs/pr/"):
         pr_num = branch.split("/")[-1]
@@ -1129,9 +1216,9 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
         pr_submit: gr.Button(
             value = f"Commit to PR #{pr_details.num}" if pr_details else "Create Pull Request",
         ),
-        # chat_template: gr.skip() if ModelFiles.TOKENIZER_CHAT_TEMPLATE not in info else gr.Code(
-        #     value = info[ModelFiles.TOKENIZER_CHAT_TEMPLATE]["data"],
-        # ),
+        chat_template: gr.skip() if ModelFiles.CHAT_TEMPLATE_JSON not in info else gr.Code(
+            value = TokenizerConfig(info[ModelFiles.CHAT_TEMPLATE_JSON]["content"]).chat_templates.get("default"),
+        ),
         # inverse_template: gr.skip() if ModelFiles.TOKENIZER_INVERSE_TEMPLATE not in info else gr.Code(
         #     value = info[ModelFiles.TOKENIZER_INVERSE_TEMPLATE]["data"],
         # ),
@@ -1198,7 +1285,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
         pr_preview_title,
         pr_description,
         pr_submit,
-        # chat_template,
+        chat_template,
         # inverse_template,
     ],
     show_api = False,
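
For reference, the lookup order introduced above (prefer chat_template.json, fall back to tokenizer_config.json when it carries a chat_template or nothing has been found yet) can be approximated outside the app with plain huggingface_hub calls. The sketch below is standalone and not part of app.py; the helper name and repo id are hypothetical, and the app itself goes through its own TokenizerConfig wrapper instead:

import json
from huggingface_hub import HfApi, hf_hub_download

hfapi = HfApi()

def load_chat_template(repo_id: str, revision: str | None = None):
    # Hypothetical helper mirroring the fallback order used by this commit.
    for filename in ("chat_template.json", "tokenizer_config.json"):
        if not hfapi.file_exists(repo_id, filename, revision = revision):
            continue
        path = hf_hub_download(repo_id, filename, revision = revision)
        with open(path, "r", encoding = "utf-8") as fp:
            content = json.load(fp)
        template = content.get("chat_template")
        if template:
            # chat_template.json normally holds a single template string, while
            # tokenizer_config.json may hold a string or a list of named templates.
            return template
    return None

# Usage (hypothetical repo id):
# print(load_chat_template("some-org/some-multimodal-model"))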