litagin committed on
Commit
eefa888
·
1 Parent(s): 4d2d3bd

Change to use anime-whisper

Browse files
Files changed (1) hide show
  1. app.py +15 -23
app.py CHANGED
@@ -16,9 +16,9 @@ is_hf = os.getenv("SYSTEM") == "spaces"
16
 
17
  generate_kwargs = {
18
  "language": "Japanese",
19
- # "do_sample": False,
20
- # "num_beams": 1,
21
- # "no_repeat_ngram_size": 3,
22
  "max_new_tokens": 64,
23
  }
24
 
@@ -29,11 +29,7 @@ model_dict = {
29
  "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
30
  "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
31
  "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
32
- "galgame-whisper-wip": (
33
- "litagin/galgame-whisper-wip"
34
- if is_hf
35
- else "../whisper_finetune/galgame-whisper"
36
- ),
37
  }
38
 
39
  logger.info("Initializing pipelines...")
@@ -90,20 +86,15 @@ def transcribe_kotoba_v2(audio) -> tuple[str, float]:
90
  return transcribe_common(audio, "kotoba-whisper-v2.0")
91
 
92
 
93
- def transcribe_galgame_whisper(audio) -> tuple[str, float]:
94
- return transcribe_common(audio, "galgame-whisper-wip")
95
-
96
-
97
- # def warmup():
98
- # logger.info("Warm-up...")
99
- # return transcribe_large_v3_turbo("test.wav")
100
 
101
 
102
  initial_md = """
103
- # Galgame-Whisper (WIP) Demo
104
 
105
- - 音声認識モデル [kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) をファインチューンした**未完成のモデル**のお試し
106
- - https://huggingface.co/litagin/galgame-whisper-wip
107
  - デモでは**音声は15秒まで**しか受け付けません
108
  - 日本語のみ対応 (Japanese only)
109
  - 現在0.1エポックくらい
@@ -113,6 +104,9 @@ pipeに渡しているkwargsは以下の最低限のもの:
113
  ```python
114
  generate_kwargs = {
115
  "language": "Japanese",
 
 
 
116
  "max_new_tokens": 64,
117
  }
118
  ```
@@ -123,8 +117,8 @@ with gr.Blocks() as app:
123
  audio = gr.Audio(type="filepath")
124
  with gr.Row():
125
  with gr.Column():
126
- gr.Markdown("### Galgame-Whisper (WIP)")
127
- button_galgame = gr.Button("Transcribe with Galgame-Whisper (WIP)")
128
  time_galgame = gr.Textbox(label="Time taken")
129
  output_galgame = gr.Textbox(label="Result")
130
  with gr.Row():
@@ -155,8 +149,6 @@ with gr.Blocks() as app:
155
  time_kotoba_v2 = gr.Textbox(label="Time taken")
156
  output_kotoba_v2 = gr.Textbox(label="Result")
157
 
158
- # warmup_result = gr.Textbox(label="Warm-up result", visible=False)
159
-
160
  button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
161
  button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
162
  button_v3_turbo.click(
@@ -171,7 +163,7 @@ with gr.Blocks() as app:
171
  transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
172
  )
173
  button_galgame.click(
174
- transcribe_galgame_whisper,
175
  inputs=audio,
176
  outputs=[output_galgame, time_galgame],
177
  )
 
16
 
17
  generate_kwargs = {
18
  "language": "Japanese",
19
+ "do_sample": False,
20
+ "num_beams": 1,
21
+ "no_repeat_ngram_size": 0,
22
  "max_new_tokens": 64,
23
  }
24
 
 
29
  "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
30
  "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
31
  "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
32
+ "anime-whisper": "litagin/anime-whisper",
 
 
 
 
33
  }
34
 
35
  logger.info("Initializing pipelines...")
 
86
  return transcribe_common(audio, "kotoba-whisper-v2.0")
87
 
88
 
89
+ def transcribe_anime_whisper(audio) -> tuple[str, float]:
90
+ return transcribe_common(audio, "anime-whisper")
 
 
 
 
 
91
 
92
 
93
  initial_md = """
94
+ # Anime-Whisper Demo
95
 
96
+ - 音声認識モデル [kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) をファインチューンしたモデルのお試し
97
+ - https://huggingface.co/litagin/anime-whisper
98
  - デモでは**音声は15秒まで**しか受け付けません
99
  - 日本語のみ対応 (Japanese only)
100
  - 現在0.1エポックくらい
 
104
  ```python
105
  generate_kwargs = {
106
  "language": "Japanese",
107
+ "do_sample": False,
108
+ "num_beams": 1,
109
+ "no_repeat_ngram_size": 0,
110
  "max_new_tokens": 64,
111
  }
112
  ```
 
117
  audio = gr.Audio(type="filepath")
118
  with gr.Row():
119
  with gr.Column():
120
+ gr.Markdown("### Anime-Whisper")
121
+ button_galgame = gr.Button("Transcribe with Anime-Whisper")
122
  time_galgame = gr.Textbox(label="Time taken")
123
  output_galgame = gr.Textbox(label="Result")
124
  with gr.Row():
 
149
  time_kotoba_v2 = gr.Textbox(label="Time taken")
150
  output_kotoba_v2 = gr.Textbox(label="Result")
151
 
 
 
152
  button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
153
  button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
154
  button_v3_turbo.click(
 
163
  transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
164
  )
165
  button_galgame.click(
166
+ transcribe_anime_whisper,
167
  inputs=audio,
168
  outputs=[output_galgame, time_galgame],
169
  )