CISCai committed
Commit e0e0d39 · verified · 1 Parent(s): a901810

Add support for multimodal processor chat_template.json

Files changed (1)
  1. app.py +113 -26
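
Note for reviewers: multimodal processors store their chat template in a standalone chat_template.json file instead of (or in addition to) tokenizer_config.json, and this commit teaches the editor to read, preview and commit that file as well. A minimal sketch of the file shape this change assumes (a JSON object whose "chat_template" key holds the Jinja template); the template string itself is purely illustrative:

import json

# Illustrative only: the rough shape of a chat_template.json as saved by a
# multimodal processor. The editor reads the "chat_template" key and writes the
# whole document back via CommitOperationAdd (see the diff below).
example_chat_template_json = json.dumps(
    {
        "chat_template": "{%- for message in messages -%}<|{{ message['role'] }}|>{{ message['content'] }}{%- endfor -%}",
    },
    indent = 2,
) + "\n"

assert "chat_template" in json.loads(example_chat_template_json)
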
app.py CHANGED
@@ -13,6 +13,7 @@ hfapi = HfApi()
 
 
 class ModelFiles(StrEnum):
+    CHAT_TEMPLATE_JSON = "chat_template.json"
     TOKENIZER_CHAT_TEMPLATE = "tokenizer_chat_template.jinja"
     TOKENIZER_CONFIG = "tokenizer_config.json"
     TOKENIZER_INVERSE_TEMPLATE = "inverse_template.jinja"
@@ -27,6 +28,7 @@ example_labels = [
     "Tool call with multiple responses",
     "Tool call with complex tool definition",
     "RAG call",
+    "Multimodal user message",
 ]
 example_values = [
     [
@@ -315,6 +317,37 @@ example_values = [
         }
     ]""",
     ],
+    [
+        "{}",
+        """[
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "content": "Can this animal"
+            },
+            {
+                "type": "image"
+            },
+            {
+                "type": "text",
+                "content": "make this sound"
+            },
+            {
+                "type": "audio"
+            },
+            {
+                "type": "text",
+                "content": "while moving like this?"
+            },
+            {
+                "type": "video"
+            }
+        ]
+    }
+]"""
+    ]
 ]
 
 
@@ -678,30 +711,32 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
     org_template_tool_use = ""
     org_template_rag = ""
 
-    tokenizer_file = info.get(ModelFiles.TOKENIZER_CONFIG, {})
-    org_config = tokenizer_file.get("data")
+    for config_file_name in (ModelFiles.CHAT_TEMPLATE_JSON, ModelFiles.TOKENIZER_CONFIG):
+        config_file = info.get(config_file_name, {})
+        org_config = config_file.get("data")
+        org_content = config_file.get("content")
 
-    if org_config:
-        tokenizer_config = TokenizerConfig(tokenizer_file.get("content"))
+        if org_content and ("chat_template" in org_content or not org_template):
+            tokenizer_config = TokenizerConfig(org_content)
 
-        org_template = tokenizer_config.chat_templates.get("default") or ""
-        org_template_tool_use = tokenizer_config.chat_templates.get("tool_use") or ""
-        org_template_rag = tokenizer_config.chat_templates.get("rag") or ""
-        # org_template_inverse = tokenizer_config.inverse_template or ""
+            org_template = tokenizer_config.chat_templates.get("default") or ""
+            org_template_tool_use = tokenizer_config.chat_templates.get("tool_use") or ""
+            org_template_rag = tokenizer_config.chat_templates.get("rag") or ""
+            # org_template_inverse = tokenizer_config.inverse_template or ""
 
-        tokenizer_config.chat_templates["default"] = template
-        tokenizer_config.chat_templates["tool_use"] = template_tool_use
-        tokenizer_config.chat_templates["rag"] = template_rag
-        # tokenizer_config.inverse_template = template_inverse
+            tokenizer_config.chat_templates["default"] = template
+            tokenizer_config.chat_templates["tool_use"] = template_tool_use
+            tokenizer_config.chat_templates["rag"] = template_rag
+            # tokenizer_config.inverse_template = template_inverse
 
-        new_config = tokenizer_config.json(get_json_indent(org_config))
-        if org_config.endswith("\n"):
-            new_config += "\n"
+            new_config = tokenizer_config.json(get_json_indent(org_config))
+            if org_config.endswith("\n"):
+                new_config += "\n"
 
-        changes += [
-            (token if token[1] in ("-", "+", "@") else token[1:].replace("\t", "\u21e5").replace("\r\n", "\u240d\u240a\r\n").replace("\r", "\u240d\r").replace("\n", "\u240a\n"), token[0] if token[0] != " " else None) # .replace(" ", "\u2423")
-            for token in unified_diff(new_config.splitlines(keepends = True), org_config.splitlines(keepends = True), fromfile = ModelFiles.TOKENIZER_CONFIG, tofile = ModelFiles.TOKENIZER_CONFIG)
-        ]
+            changes += [
+                (token if token[1] in ("-", "+", "@") else token[1:].replace("\t", "\u21e5").replace("\r\n", "\u240d\u240a\r\n").replace("\r", "\u240d\r").replace("\n", "\u240a\n"), token[0] if token[0] != " " else None) # .replace(" ", "\u2423")
+                for token in unified_diff(new_config.splitlines(keepends = True), org_config.splitlines(keepends = True), fromfile = config_file_name, tofile = config_file_name)
+            ]
 
     tokenizer_chat_template = info.get(ModelFiles.TOKENIZER_CHAT_TEMPLATE, {})
     org_template = tokenizer_chat_template.get("data", org_template)
@@ -766,15 +801,33 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
     operations = []
     pr_branch = branch if branch.startswith("refs/pr/") else None
 
-    tokenizer_file = info.get(ModelFiles.TOKENIZER_CONFIG, {})
-    if org_config := tokenizer_file.get("data"):
-        tokenizer_config = TokenizerConfig(tokenizer_file.get("content"))
+    chat_template_file = info.get(ModelFiles.CHAT_TEMPLATE_JSON, {})
+    if org_chat_template := chat_template_file.get("data"):
+        tokenizer_config = TokenizerConfig(chat_template_file.get("content"))
 
         tokenizer_config.chat_templates["default"] = template
         tokenizer_config.chat_templates["tool_use"] = template_tool_use
         tokenizer_config.chat_templates["rag"] = template_rag
         # tokenizer_config.inverse_template = template_inverse
 
+        new_chat_template = tokenizer_config.json(get_json_indent(org_chat_template))
+        if org_chat_template.endswith("\n"):
+            new_chat_template += "\n"
+
+        if org_chat_template != new_chat_template:
+            operations.append(CommitOperationAdd(ModelFiles.CHAT_TEMPLATE_JSON, new_chat_template.encode("utf-8")))
+
+    tokenizer_file = info.get(ModelFiles.TOKENIZER_CONFIG, {})
+    if org_config := tokenizer_file.get("data"):
+        tokenizer_content = tokenizer_file.get("content")
+        tokenizer_config = TokenizerConfig(tokenizer_content)
+
+        if "chat_template" in tokenizer_content or not chat_template_file:
+            tokenizer_config.chat_templates["default"] = template
+            tokenizer_config.chat_templates["tool_use"] = template_tool_use
+            tokenizer_config.chat_templates["rag"] = template_rag
+            # tokenizer_config.inverse_template = template_inverse
+
         new_config = tokenizer_config.json(get_json_indent(org_config))
         if org_config.endswith("\n"):
             new_config += "\n"
@@ -817,6 +870,10 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
 
     info["parent_commit"] = commit.oid
 
+    if org_chat_template:
+        chat_template_file["data"] = new_chat_template
+        chat_template_file["content"] = json.loads(new_chat_template)
+
     if org_config:
         tokenizer_file["data"] = new_config
         tokenizer_file["content"] = json.loads(new_config)
@@ -950,6 +1007,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
         "disabled": info.disabled,
         "gated": info.gated,
        "private": info.private,
+        "chat_template": templates,
     }
 
     template_messages = example_values[0][1]
@@ -991,6 +1049,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
     progress = gr.Progress(track_tqdm = True),
     oauth_token: gr.OAuthToken | None = None,
 ):
+    parent_commit = None
     write_access = False
 
     if info and oauth_token:
@@ -1097,6 +1156,34 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
     else:
         gr.Warning(f"No {ModelFiles.TOKENIZER_CONFIG} found in repository...")
 
+    if not info.get("chat_template"):
+        try:
+            chat_template_file = None
+            if (hfapi.file_exists(
+                repo,
+                ModelFiles.CHAT_TEMPLATE_JSON,
+                revision = branch,
+                token = oauth_token.token if oauth_token else False,
+            )):
+                chat_template_file = hfapi.hf_hub_download(
+                    repo,
+                    ModelFiles.CHAT_TEMPLATE_JSON,
+                    revision = parent_commit or branch,
+                    token = oauth_token.token if oauth_token else False,
+                )
+        except Exception as e:
+            pass
+        else:
+            if chat_template_file:
+                with open(chat_template_file, "r", encoding = "utf-8") as fp:
+                    template_data = fp.read()
+                template_content = json.loads(template_data)
+                info[ModelFiles.CHAT_TEMPLATE_JSON] = {
+                    "data": template_data,
+                    "content": template_content,
+                }
+                info["chat_template"] = template_content.get("chat_template")
+
     pr_details = None
     if branch and branch.startswith("refs/pr/"):
         pr_num = branch.split("/")[-1]
@@ -1129,9 +1216,9 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
         pr_submit: gr.Button(
             value = f"Commit to PR #{pr_details.num}" if pr_details else "Create Pull Request",
         ),
-        # chat_template: gr.skip() if ModelFiles.TOKENIZER_CHAT_TEMPLATE not in info else gr.Code(
-        #     value = info[ModelFiles.TOKENIZER_CHAT_TEMPLATE]["data"],
-        # ),
+        chat_template: gr.skip() if ModelFiles.CHAT_TEMPLATE_JSON not in info else gr.Code(
+            value = TokenizerConfig(info[ModelFiles.CHAT_TEMPLATE_JSON]["content"]).chat_templates.get("default"),
+        ),
         # inverse_template: gr.skip() if ModelFiles.TOKENIZER_INVERSE_TEMPLATE not in info else gr.Code(
         #     value = info[ModelFiles.TOKENIZER_INVERSE_TEMPLATE]["data"],
         # ),
@@ -1198,7 +1285,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
         pr_preview_title,
         pr_description,
         pr_submit,
-        # chat_template,
+        chat_template,
         # inverse_template,
     ],
     show_api = False,
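
For reference, the lookup order introduced above (prefer chat_template.json, fall back to tokenizer_config.json when it carries a chat_template or nothing has been found yet) can be approximated outside the app with plain huggingface_hub calls. The sketch below is standalone and not part of app.py; the helper name and repo id are hypothetical, and the app itself goes through its own TokenizerConfig wrapper instead:

import json
from huggingface_hub import HfApi, hf_hub_download

hfapi = HfApi()

def load_chat_template(repo_id: str, revision: str | None = None):
    # Hypothetical helper mirroring the fallback order used by this commit.
    for filename in ("chat_template.json", "tokenizer_config.json"):
        if not hfapi.file_exists(repo_id, filename, revision = revision):
            continue
        path = hf_hub_download(repo_id, filename, revision = revision)
        with open(path, "r", encoding = "utf-8") as fp:
            content = json.load(fp)
        template = content.get("chat_template")
        if template:
            # chat_template.json normally holds a single template string, while
            # tokenizer_config.json may hold a string or a list of named templates.
            return template
    return None

# Usage (hypothetical repo id):
# print(load_chat_template("some-org/some-multimodal-model"))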