Spaces:

litagin
/

anime-whisper-demo

Running on Zero

App Files Files Community

litagin committed on Nov 10, 2024

Commit

eefa888

1 Parent(s): 4d2d3bd

Change to use anime-whisper

Browse files

Files changed (1) hide show

app.py +15 -23

app.py CHANGED Viewed

@@ -16,9 +16,9 @@ is_hf = os.getenv("SYSTEM") == "spaces"
 generate_kwargs = {
     "language": "Japanese",
-    # "do_sample": False,
-    # "num_beams": 1,
-    # "no_repeat_ngram_size": 3,
     "max_new_tokens": 64,
 }
@@ -29,11 +29,7 @@ model_dict = {
     "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
     "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
     "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
-    "galgame-whisper-wip": (
-        "litagin/galgame-whisper-wip"
-        if is_hf
-        else "../whisper_finetune/galgame-whisper"
-    ),
 }
 logger.info("Initializing pipelines...")
@@ -90,20 +86,15 @@ def transcribe_kotoba_v2(audio) -> tuple[str, float]:
     return transcribe_common(audio, "kotoba-whisper-v2.0")
-def transcribe_galgame_whisper(audio) -> tuple[str, float]:
-    return transcribe_common(audio, "galgame-whisper-wip")
-# def warmup():
-#     logger.info("Warm-up...")
-#     return transcribe_large_v3_turbo("test.wav")
 initial_md = """
-# Galgame-Whisper (WIP) Demo
-- 音声認識モデル [kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) をファインチューンした**未完成のモデル**のお試し
-- https://huggingface.co/litagin/galgame-whisper-wip
 - デモでは**音声は15秒まで**しか受け付けません
 - 日本語のみ対応 (Japanese only)
 - 現在0.1エポックくらい
@@ -113,6 +104,9 @@ pipeに渡しているkwargsは以下の最低限のもの:
 ```python
 generate_kwargs = {
     "language": "Japanese",
     "max_new_tokens": 64,
 }
 ```
@@ -123,8 +117,8 @@ with gr.Blocks() as app:
     audio = gr.Audio(type="filepath")
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Galgame-Whisper (WIP)")
-            button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
             time_galgame = gr.Textbox(label="Time taken")
             output_galgame = gr.Textbox(label="Result")
     with gr.Row():
@@ -155,8 +149,6 @@ with gr.Blocks() as app:
             time_kotoba_v2 = gr.Textbox(label="Time taken")
             output_kotoba_v2 = gr.Textbox(label="Result")
-    # warmup_result = gr.Textbox(label="Warm-up result", visible=False)
     button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
     button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
     button_v3_turbo.click(
@@ -171,7 +163,7 @@ with gr.Blocks() as app:
         transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
     )
     button_galgame.click(
-        transcribe_galgame_whisper,
         inputs=audio,
         outputs=[output_galgame, time_galgame],
     )

 generate_kwargs = {
     "language": "Japanese",
+    "do_sample": False,
+    "num_beams": 1,
+    "no_repeat_ngram_size": 0,
     "max_new_tokens": 64,
 }
     "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
     "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
     "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
+    "anime-whisper": "litagin/anime-whisper",
 }
 logger.info("Initializing pipelines...")
     return transcribe_common(audio, "kotoba-whisper-v2.0")
+def transcribe_anime_whisper(audio) -> tuple[str, float]:
+    return transcribe_common(audio, "anime-whisper")
 initial_md = """
+# Anime-Whisper Demo
+- 音声認識モデル [kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) をファインチューンしたモデルのお試し
+- https://huggingface.co/litagin/anime-whisper
 - デモでは**音声は15秒まで**しか受け付けません
 - 日本語のみ対応 (Japanese only)
 - 現在0.1エポックくらい
 ```python
 generate_kwargs = {
     "language": "Japanese",
+    "do_sample": False,
+    "num_beams": 1,
+    "no_repeat_ngram_size": 0,
     "max_new_tokens": 64,
 }
 ```
     audio = gr.Audio(type="filepath")
     with gr.Row():
         with gr.Column():
+            gr.Markdown("### Anime-Whisper")
+            button_galgame = gr.Button("Transcribe with Anime-Whisper")
             time_galgame = gr.Textbox(label="Time taken")
             output_galgame = gr.Textbox(label="Result")
     with gr.Row():
             time_kotoba_v2 = gr.Textbox(label="Time taken")
             output_kotoba_v2 = gr.Textbox(label="Result")
     button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
     button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
     button_v3_turbo.click(
         transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
     )
     button_galgame.click(
+        transcribe_anime_whisper,
         inputs=audio,
         outputs=[output_galgame, time_galgame],
     )