gorkemgoknar committed on
Commit
0a0b1ab
·
1 Parent(s): f31f07e

improvements

Browse files
Files changed (1) hide show
  1. app.py +96 -64
app.py CHANGED
@@ -11,8 +11,9 @@ import gradio as gr
11
  import numpy as np
12
  import torch
13
  import nltk # we'll use this to split into sentences
14
-
15
  nltk.download("punkt")
 
 
16
  import uuid
17
 
18
  import datetime
@@ -33,9 +34,10 @@ from TTS.utils.generic_utils import get_user_data_dir
33
  # For older cards (like 2070 or T4), reduce this value to a smaller one to avoid unnecessary waiting
34
  # Could not make "play audio next" work seamlessly on current Gradio with autoplay, so this is a workaround
35
  AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 0.9))
36
-
37
  # If set, will try to stream audio while receiving audio chunks; beware that recreating audio each time produces artifacts
38
  DIRECT_STREAM = int(os.environ.get("DIRECT_STREAM", 0))
 
39
 
40
  # This will trigger downloading model
41
  print("Downloading if not downloaded Coqui XTTS V1")
@@ -73,7 +75,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
73
  # will use api to restart space on an unrecoverable error
74
  api = HfApi(token=HF_TOKEN)
75
 
76
- repo_id = "ylacombe/voice-chat-with-mistral"
77
 
78
  default_system_message = """
79
  You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
@@ -94,6 +96,7 @@ system_understand_message = os.environ.get(
94
  "SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
95
  )
96
 
 
97
 
98
  temperature = 0.9
99
  top_p = 0.6
@@ -157,9 +160,28 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
157
  wav_buf.seek(0)
158
  return wav_buf.read()
159
 
160
-
161
  def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
162
  gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  try:
164
  t0 = time.time()
165
  chunks = model.inference_stream(
@@ -381,7 +403,7 @@ def get_sentence(history, system_prompt=""):
381
  #### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
382
 
383
  def generate_speech(history):
384
- language = "en"
385
 
386
  wav_bytestream = b""
387
  for sentence, history in get_sentence(history):
@@ -403,65 +425,75 @@ def generate_speech(history):
403
  print("Sentence for speech:", sentence)
404
 
405
  try:
406
- #TODO this will be better handled in future using textwrap
407
- if len(sentence) > 300:
408
- gr.Warning("There was a problem with the last sentence, which was too long, so it won't be spoken.")
409
- # should not generate voice it will hit token limit
410
- # It should not generate audio for it
411
- audio_stream = None
412
  else:
413
- audio_stream = get_voice_streaming(
414
- sentence, language, latent_map["Female_Voice"]
415
- )
416
- # XTTS is actually using streaming response but we are playing audio by sentence
417
- # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
418
- if audio_stream is not None:
419
- wav_chunks = wave_header_chunk()
420
- frame_length = 0
421
- for chunk in audio_stream:
422
- try:
423
- wav_bytestream += chunk
424
- if DIRECT_STREAM:
425
- yield (
426
- gr.Audio.update(
427
- value=wave_header_chunk() + chunk, autoplay=True
428
- ),
429
- history,
430
- )
431
- wait_time = len(chunk) / 2 / 24000
432
- wait_time = AUDIO_WAIT_MODIFIER * wait_time
433
- print("Sleeping till chunk end")
434
- time.sleep(wait_time)
435
-
436
- else:
437
- wav_chunks += chunk
438
- frame_length += len(chunk)
439
- except:
440
- # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
441
- continue
442
-
443
- if not DIRECT_STREAM:
444
- yield (
445
- gr.Audio.update(value=None, autoplay=True),
446
- history,
447
- ) # hack to switch autoplay
448
- if audio_stream is not None:
449
- yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
450
- # Streaming wait time calculation
451
- # audio_length = frame_length / sample_width/ frame_rate
452
- wait_time = frame_length / 2 / 24000
453
-
454
- # for non streaming
455
- # wait_time= librosa.get_duration(path=wav)
456
-
457
- wait_time = AUDIO_WAIT_MODIFIER * wait_time
458
- print("Sleeping till audio end")
459
- time.sleep(wait_time)
460
  else:
461
- # Either too much text or some programming, give a silence so stream continues
462
- second_of_silence = AudioSegment.silent() # use default
463
- second_of_silence.export("sil.wav", format="wav")
464
- yield (gr.Audio.update(value="sil.wav", autoplay=True), history)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
 
466
  except RuntimeError as e:
467
  if "device-side assert" in str(e):
@@ -479,7 +511,7 @@ def generate_speech(history):
479
  print("RuntimeError: non device-side assert error:", str(e))
480
  raise e
481
 
482
- time.sleep(1.0)
483
  wav_bytestream = wave_header_chunk() + wav_bytestream
484
  outfile = "combined.wav"
485
  with open(outfile, "wb") as f:
@@ -495,7 +527,7 @@ with gr.Blocks(title=title) as demo:
495
  chatbot = gr.Chatbot(
496
  [],
497
  elem_id="chatbot",
498
- avatar_images=("examples/hf-logo.png", "examples/coqui-logo.png"),
499
  bubble_full_width=False,
500
  )
501
 
 
11
  import numpy as np
12
  import torch
13
  import nltk # we'll use this to split into sentences
 
14
  nltk.download("punkt")
15
+
16
+ import langid
17
  import uuid
18
 
19
  import datetime
 
34
  # For older cards (like 2070 or T4), reduce this value to a smaller one to avoid unnecessary waiting
35
  # Could not make "play audio next" work seamlessly on current Gradio with autoplay, so this is a workaround
36
  AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 0.9))
37
+ print("AUDIO_WAIT_MODIFIER set to",AUDIO_WAIT_MODIFIER)
38
  # If set, will try to stream audio while receiving audio chunks; beware that recreating audio each time produces artifacts
39
  DIRECT_STREAM = int(os.environ.get("DIRECT_STREAM", 0))
40
+ print("DIRECT_STREAM set to",DIRECT_STREAM)
41
 
42
  # This will trigger downloading model
43
  print("Downloading if not downloaded Coqui XTTS V1")
 
75
  # will use api to restart space on an unrecoverable error
76
  api = HfApi(token=HF_TOKEN)
77
 
78
+ repo_id = "coqui/voice-chat-with-mistral"
79
 
80
  default_system_message = """
81
  You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
 
96
  "SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
97
  )
98
 
99
+ print("Mistral system message set as:", default_system_message)
100
 
101
  temperature = 0.9
102
  top_p = 0.6
 
160
  wav_buf.seek(0)
161
  return wav_buf.read()
162
 
163
+ xtts_supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
164
  def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
165
  gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
166
+
167
+ # Fast language autodetection
168
+ if len(prompt)>15 and language=="autodetect":
169
+ language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
170
+ if language_predicted == "zh":
171
+ #we use zh-cn on xtts
172
+ language_predicted = "zh-cn"
173
+ if language_predicted not in xtts_supported_languages:
174
+ print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
175
+ gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet' ")
176
+ language= "en"
177
+ else:
178
+ language = language_predicted
179
+ print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")
180
+ else:
181
+ # Hard to detect language fast in short sentence, use english default
182
+ language = "en"
183
+ print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
184
+
185
  try:
186
  t0 = time.time()
187
  chunks = model.inference_stream(
 
403
  #### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
404
 
405
  def generate_speech(history):
406
+ language = "autodetect"
407
 
408
  wav_bytestream = b""
409
  for sentence, history in get_sentence(history):
 
425
  print("Sentence for speech:", sentence)
426
 
427
  try:
428
+ if len(sentence)<300:
429
+ # no problem continue on
430
+ sentence_list = [sentence]
 
 
 
431
  else:
432
+ # Until now nltk likely split sentences properly but we need additional
433
+ # check for longer sentence and split at last possible position
434
+ # Do whatever is necessary: first break at hyphens, then at spaces, and finally split very long words
435
+ sentence_list=textwrap(sentence,300)
436
+ print("SPLITTED LONG SENTENCE:",sentence_list)
437
+
438
+ for sentence in sentence_list:
439
+ if any(c.isalnum() for c in sentence):
440
+ #exists at least 1 alphanumeric (utf-8)
441
+ audio_stream = get_voice_streaming(
442
+ sentence, language, latent_map["Female_Voice"]
443
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  else:
445
+ # likely got a ' or " or some other text without alphanumeric in it
446
+ audio_stream = None
447
+
448
+ # XTTS is actually using streaming response but we are playing audio by sentence
449
+ # If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
450
+ if audio_stream is not None:
451
+ wav_chunks = wave_header_chunk()
452
+ frame_length = 0
453
+ for chunk in audio_stream:
454
+ try:
455
+ wav_bytestream += chunk
456
+ if DIRECT_STREAM:
457
+ yield (
458
+ gr.Audio.update(
459
+ value=wave_header_chunk() + chunk, autoplay=True
460
+ ),
461
+ history,
462
+ )
463
+ wait_time = len(chunk) / 2 / 24000
464
+ wait_time = AUDIO_WAIT_MODIFIER * wait_time
465
+ print("Sleeping till chunk end")
466
+ time.sleep(wait_time)
467
+
468
+ else:
469
+ wav_chunks += chunk
470
+ frame_length += len(chunk)
471
+ except:
472
+ # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
473
+ continue
474
+
475
+ if not DIRECT_STREAM:
476
+ yield (
477
+ gr.Audio.update(value=None, autoplay=True),
478
+ history,
479
+ ) # hack to switch autoplay
480
+ if audio_stream is not None:
481
+ yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
482
+ # Streaming wait time calculation
483
+ # audio_length = frame_length / sample_width/ frame_rate
484
+ wait_time = frame_length / 2 / 24000
485
+
486
+ # for non streaming
487
+ # wait_time= librosa.get_duration(path=wav)
488
+
489
+ wait_time = AUDIO_WAIT_MODIFIER * wait_time
490
+ print("Sleeping till audio end")
491
+ time.sleep(wait_time)
492
+ else:
493
+ # Either too much text or some programming, give a silence so stream continues
494
+ second_of_silence = AudioSegment.silent() # use default
495
+ second_of_silence.export("sil.wav", format="wav")
496
+ yield (gr.Audio.update(value="sil.wav", autoplay=True), history)
497
 
498
  except RuntimeError as e:
499
  if "device-side assert" in str(e):
 
511
  print("RuntimeError: non device-side assert error:", str(e))
512
  raise e
513
 
514
+ time.sleep(1.5)
515
  wav_bytestream = wave_header_chunk() + wav_bytestream
516
  outfile = "combined.wav"
517
  with open(outfile, "wb") as f:
 
527
  chatbot = gr.Chatbot(
528
  [],
529
  elem_id="chatbot",
530
+ avatar_images=("examples/mirror.png", "examples/coqui-logo.png"),
531
  bubble_full_width=False,
532
  )
533