litagin committed on
Commit
5c91bae
·
1 Parent(s): 07c2d2e
Files changed (2) hide show
  1. README.md +2 -2
  2. app.py +22 -63
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Galgame Whisper (WIP) Demo
3
  emoji: 🥰🎤📝
4
  colorFrom: blue
5
  colorTo: pink
6
  sdk: gradio
7
- sdk_version: 5.0.2
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
+ title: Anime Whisper Demo
3
  emoji: 🥰🎤📝
4
  colorFrom: blue
5
  colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 5.5.0
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py CHANGED
@@ -28,8 +28,6 @@ generate_kwargs = {
28
  model_dict = {
29
  "whisper-large-v2": "openai/whisper-large-v2",
30
  "whisper-large-v3": "openai/whisper-large-v3",
31
- "whisper-large-v3-turbo": "openai/whisper-large-v3-turbo",
32
- "kotoba-whisper-v1.0": "kotoba-tech/kotoba-whisper-v1.0",
33
  "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
34
  "anime-whisper": "litagin/anime-whisper",
35
  }
@@ -47,9 +45,9 @@ logger.success("Pipelines initialized!")
47
 
48
 
49
  @spaces.GPU
50
- def transcribe_common(audio: str, model: str) -> tuple[str, float]:
51
  if not audio:
52
- return "No audio file", 0
53
  filename = Path(audio).name
54
  logger.info(f"Model: {model}")
55
  logger.info(f"Audio: {filename}")
@@ -60,35 +58,22 @@ def transcribe_common(audio: str, model: str) -> tuple[str, float]:
60
  logger.info(f"Duration: {duration:.2f}s")
61
  if duration > 15:
62
  logger.error(f"Audio too long, limit is 15 seconds, got {duration:.2f}s")
63
- return f"Audio too long, limit is 15 seconds, got {duration:.2f}s", 0
64
  start_time = time.time()
65
  result = pipe_dict[model](y, generate_kwargs=generate_kwargs)["text"]
66
  end_time = time.time()
67
  logger.success(f"Finished in {end_time - start_time:.2f}s\n{result}")
68
- return result, end_time - start_time
69
 
70
 
71
- def transcribe_large_v2(audio) -> tuple[str, float]:
72
- return transcribe_common(audio, "whisper-large-v2")
 
 
 
73
 
74
 
75
- def transcribe_large_v3(audio) -> tuple[str, float]:
76
- return transcribe_common(audio, "whisper-large-v3")
77
-
78
-
79
- def transcribe_large_v3_turbo(audio) -> tuple[str, float]:
80
- return transcribe_common(audio, "whisper-large-v3-turbo")
81
-
82
-
83
- def transcribe_kotoba_v1(audio) -> tuple[str, float]:
84
- return transcribe_common(audio, "kotoba-whisper-v1.0")
85
-
86
-
87
- def transcribe_kotoba_v2(audio) -> tuple[str, float]:
88
- return transcribe_common(audio, "kotoba-whisper-v2.0")
89
-
90
-
91
- def transcribe_anime_whisper(audio) -> tuple[str, float]:
92
  return transcribe_common(audio, "anime-whisper")
93
 
94
 
@@ -99,17 +84,16 @@ initial_md = """
99
  - https://huggingface.co/litagin/anime-whisper
100
  - デモでは**音声は15秒まで**しか受け付けません
101
  - 日本語のみ対応 (Japanese only)
102
- - 現在0.1エポックくらい
103
- - 比較できるように他モデルもついでに試せる
104
 
105
  pipeに渡しているkwargsは以下の最低限のもの:
106
  ```python
107
  generate_kwargs = {
108
  "language": "Japanese",
109
  "do_sample": False,
110
- "num_beams": 1,
111
  "no_repeat_ngram_size": 0,
112
- "max_new_tokens": 64,
113
  }
114
  ```
115
  """
@@ -121,54 +105,29 @@ with gr.Blocks() as app:
121
  with gr.Column():
122
  gr.Markdown("### Anime-Whisper")
123
  button_galgame = gr.Button("Transcribe with Anime-Whisper")
124
- time_galgame = gr.Textbox(label="Time taken")
125
  output_galgame = gr.Textbox(label="Result")
126
  with gr.Row():
 
 
127
  with gr.Column():
128
  gr.Markdown("### Whisper-Large-V2")
129
- button_v2 = gr.Button("Transcribe with Whisper-Large-V2")
130
- time_v2 = gr.Textbox(label="Time taken")
131
  output_v2 = gr.Textbox(label="Result")
132
  with gr.Column():
133
  gr.Markdown("### Whisper-Large-V3")
134
- button_v3 = gr.Button("Transcribe with Whisper-Large-V3")
135
- time_v3 = gr.Textbox(label="Time taken")
136
  output_v3 = gr.Textbox(label="Result")
137
- with gr.Column():
138
- gr.Markdown("### Whisper-Large-V3-Turbo")
139
- button_v3_turbo = gr.Button("Transcribe with Whisper-Large-V3-Turbo")
140
- time_v3_turbo = gr.Textbox(label="Time taken")
141
- output_v3_turbo = gr.Textbox(label="Result")
142
- with gr.Row():
143
- with gr.Column():
144
- gr.Markdown("### Kotoba-Whisper-V1.0")
145
- button_kotoba_v1 = gr.Button("Transcribe with Kotoba-Whisper-V1.0")
146
- time_kotoba_v1 = gr.Textbox(label="Time taken")
147
- output_kotoba_v1 = gr.Textbox(label="Result")
148
  with gr.Column():
149
  gr.Markdown("### Kotoba-Whisper-V2.0")
150
- button_kotoba_v2 = gr.Button("Transcribe with Kotoba-Whisper-V2.0")
151
- time_kotoba_v2 = gr.Textbox(label="Time taken")
152
  output_kotoba_v2 = gr.Textbox(label="Result")
153
 
154
- button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
155
- button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
156
- button_v3_turbo.click(
157
- transcribe_large_v3_turbo,
158
- inputs=audio,
159
- outputs=[output_v3_turbo, time_v3_turbo],
160
- )
161
- button_kotoba_v1.click(
162
- transcribe_kotoba_v1, inputs=audio, outputs=[output_kotoba_v1, time_kotoba_v1]
163
- )
164
- button_kotoba_v2.click(
165
- transcribe_kotoba_v2, inputs=audio, outputs=[output_kotoba_v2, time_kotoba_v2]
166
- )
167
  button_galgame.click(
168
  transcribe_anime_whisper,
169
- inputs=audio,
170
- outputs=[output_galgame, time_galgame],
 
 
 
 
 
171
  )
172
 
173
- # app.load(warmup, inputs=[], outputs=[warmup_result], queue=True)
174
  app.launch(inbrowser=True)
 
28
  model_dict = {
29
  "whisper-large-v2": "openai/whisper-large-v2",
30
  "whisper-large-v3": "openai/whisper-large-v3",
 
 
31
  "kotoba-whisper-v2.0": "kotoba-tech/kotoba-whisper-v2.0",
32
  "anime-whisper": "litagin/anime-whisper",
33
  }
 
45
 
46
 
47
  @spaces.GPU
48
+ def transcribe_common(audio: str, model: str) -> str:
49
  if not audio:
50
+ return "No audio file"
51
  filename = Path(audio).name
52
  logger.info(f"Model: {model}")
53
  logger.info(f"Audio: {filename}")
 
58
  logger.info(f"Duration: {duration:.2f}s")
59
  if duration > 15:
60
  logger.error(f"Audio too long, limit is 15 seconds, got {duration:.2f}s")
61
+ return f"Audio too long, limit is 15 seconds, got {duration:.2f}s"
62
  start_time = time.time()
63
  result = pipe_dict[model](y, generate_kwargs=generate_kwargs)["text"]
64
  end_time = time.time()
65
  logger.success(f"Finished in {end_time - start_time:.2f}s\n{result}")
66
+ return result
67
 
68
 
69
+ def transcribe_others(audio) -> tuple[str, str, str]:
70
+ result_v2 = transcribe_common(audio, "whisper-large-v2")
71
+ result_v3 = transcribe_common(audio, "whisper-large-v3")
72
+ result_kotoba_v2 = transcribe_common(audio, "kotoba-whisper-v2.0")
73
+ return result_v2, result_v3, result_kotoba_v2
74
 
75
 
76
+ def transcribe_anime_whisper(audio) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  return transcribe_common(audio, "anime-whisper")
78
 
79
 
 
84
  - https://huggingface.co/litagin/anime-whisper
85
  - デモでは**音声は15秒まで**しか受け付けません
86
  - 日本語のみ対応 (Japanese only)
87
+ - 比較のために [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2) と [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) と [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0) も用意しています
 
88
 
89
  pipeに渡しているkwargsは以下の最低限のもの:
90
  ```python
91
  generate_kwargs = {
92
  "language": "Japanese",
93
  "do_sample": False,
94
+ "num_beams": 1,
95
  "no_repeat_ngram_size": 0,
96
+ "max_new_tokens": 64, # 結果が長いときは途中で打ち切る
97
  }
98
  ```
99
  """
 
105
  with gr.Column():
106
  gr.Markdown("### Anime-Whisper")
107
  button_galgame = gr.Button("Transcribe with Anime-Whisper")
 
108
  output_galgame = gr.Textbox(label="Result")
109
  with gr.Row():
110
+ gr.Markdown("### Comparison")
111
+ button_others = gr.Button("Transcribe with other models")
112
  with gr.Column():
113
  gr.Markdown("### Whisper-Large-V2")
 
 
114
  output_v2 = gr.Textbox(label="Result")
115
  with gr.Column():
116
  gr.Markdown("### Whisper-Large-V3")
 
 
117
  output_v3 = gr.Textbox(label="Result")
 
 
 
 
 
 
 
 
 
 
 
118
  with gr.Column():
119
  gr.Markdown("### Kotoba-Whisper-V2.0")
 
 
120
  output_kotoba_v2 = gr.Textbox(label="Result")
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  button_galgame.click(
123
  transcribe_anime_whisper,
124
+ inputs=[audio],
125
+ outputs=[output_galgame],
126
+ )
127
+ button_others.click(
128
+ transcribe_others,
129
+ inputs=[audio],
130
+ outputs=[output_v2, output_v3, output_kotoba_v2],
131
  )
132
 
 
133
  app.launch(inbrowser=True)