fx audiobook api

Browse files

Files changed (5) hide show

README.md +3 -4
text_utils.py → Utils/text_utils.py +1 -2
api.py +22 -18
audiobook.py +119 -151
msinference.py +40 -36

README.md CHANGED Viewed

@@ -18,7 +18,7 @@ tags:
 # Affective TTS / Soundscape
-This repo is an expansion of the [SHIFT TTS tool](https://github.com/audeering/shift) with [foreign langs](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv) and audio sound generation via [AudioGen](https://huggingface.co/dkounadis/artificial-styletts2/discussions/3).
   - Analysis of emotions of TTS [#1](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
   - `landscape2soundscape.py` generates soundscape, i.e. `trees, water, ..` & overlays TTS & creates video from image.
@@ -117,10 +117,9 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=
 # Audiobook
-Convert `.docx` to audio `.wav` & `.mp4`. Via multiple voices. Listen to it in YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
 ```python
-# uses Flask api.py
-# download shift/assets/INCLUSION_IN_MUSEUMS_audiobook.docx
 python audiobook.py
 ```

 # Affective TTS / Soundscape
+Expansion of the [SHIFT TTS tool](https://github.com/audeering/shift) with [foreign langs](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv) and audio soundscape via [AudioGen](https://huggingface.co/dkounadis/artificial-styletts2/discussions/3).
   - Analysis of emotions of TTS [#1](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
   - `landscape2soundscape.py` generates a soundscape, e.g. `trees, water, ..`, overlays TTS & creates a video from an image.
 # Audiobook
+Create an audiobook from a `.docx` file. Listen to it on YouTube: [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
 ```python
+# generated audiobook will be saved in ./tts_audiobooks
 python audiobook.py
 ```

text_utils.py → Utils/text_utils.py RENAMED Viewed

@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-import numpy as np
 import re
 import codecs
 import textwrap
@@ -118,4 +117,4 @@ def store_ssml(text=None,
     print(len(text),'\n\n\n\n\n\n\n', _s)
     with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
-        f.write(_s)

 # -*- coding: utf-8 -*-
 import re
 import codecs
 import textwrap
     print(len(text),'\n\n\n\n\n\n\n', _s)
     with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
+        f.write(_s)

api.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import numpy as np
 import soundfile
 import audresample
-import text_utils
 import msinference
 import re
 import srt
@@ -35,7 +35,8 @@ nltk.download('punkt')
 #   git remote set-url origin git@github.com:audeering/shift
 # ==
 def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     '''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
@@ -144,19 +145,22 @@ def tts_multi_sentence(precomputed_style_vector=None,
                                     beta=0.7,
                                     diffusion_steps=7,
                                     embedding_scale=1))
-    # Fallback - MMS TTS - Non-English Foreign voice=language
-    else:
-        x = []
-        for _sentence in text:
-            x.append(msinference.foreign(text=_sentence,
-                                    lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
-                                    speed=speed))
-    x = np.concatenate(x)
-    x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
     return overlay(x, scene=scene)
@@ -185,15 +189,15 @@ def serve_wav():
     # Physically Save Client Files
     for filename, obj in request.files.items():
-        obj.save(f'{CACHE_DIR}{filename.replace("/","")}')
     print('Saved all files on Server Side\n\n')
     args = SimpleNamespace(
-        text      = None if r.get('text')  is None else CACHE_DIR + r.get('text' )[0][-6:],
-        video     = None if r.get('video') is None else CACHE_DIR + r.get('video')[0][-6:],
-        image     = None if r.get('image') is None else CACHE_DIR + r.get('image')[0][-6:],
-        native    = None if r.get('native') is None else CACHE_DIR + r.get('native')[0][-6:],
         affective =       r.get('affective')[0],
         voice     =       r.get('voice')[0],
         speed     = float(r.get('speed')[0]),  # For Non-English MMS TTS
@@ -233,7 +237,7 @@ def serve_wav():
         with open(args.text, 'r') as f:
             t = ''.join(f)
         t = re.sub(' +', ' ', t)  # delete spaces
-        text = text_utils.split_into_sentences(t)  # split to short sentences (~200 phonemes max)
     # ====STYLE VECTOR====

 import numpy as np
 import soundfile
 import audresample
+from Utils.text_utils import split_into_sentences
 import msinference
 import re
 import srt
 #   git remote set-url origin git@github.com:audeering/shift
 # ==
+def _shorten(filename):
+    return filename.replace("/","")[-6:]
 def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     '''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
                                     beta=0.7,
                                     diffusion_steps=7,
                                     embedding_scale=1))
+        x = np.concatenate(x)
+    # Fallback - MMS TTS - Non-English
+    else:
+        # don't split foreign sentences: avoids re-loading VITS & the random speaker change issue
+        x = msinference.foreign(text=text,
+                                lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
+                                speed=speed)  # normalisation externally
+    # volume
+    x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
     return overlay(x, scene=scene)
     # Physically Save Client Files
     for filename, obj in request.files.items():
+        obj.save(f'{CACHE_DIR}{_shorten(filename)}')
     print('Saved all files on Server Side\n\n')
     args = SimpleNamespace(
+        text      = None if r.get('text')  is None else CACHE_DIR + _shorten(r.get('text' )[0]),  # crop last letters from original filename & use as tmp
+        video     = None if r.get('video') is None else CACHE_DIR + _shorten(r.get('video')[0]),
+        image     = None if r.get('image') is None else CACHE_DIR + _shorten(r.get('image')[0]),
+        native    = None if r.get('native') is None else CACHE_DIR + _shorten(r.get('native')[0]),
         affective =       r.get('affective')[0],
         voice     =       r.get('voice')[0],
         speed     = float(r.get('speed')[0]),  # For Non-English MMS TTS
         with open(args.text, 'r') as f:
             t = ''.join(f)
         t = re.sub(' +', ' ', t)  # delete spaces
+        text = split_into_sentences(t)  # split to short sentences (~100 phonemes max, to avoid OOM)
     # ====STYLE VECTOR====

audiobook.py CHANGED Viewed

@@ -1,22 +1,15 @@
-# FOR EACH VOICE -> create .wav file per chapter & full audiobook.wav from assets/INCLUSION_IN_MUSEUMS_audiobook.docx
-#
-# Chapters
-#
-#   ROOT_DIR/voice/voxstr_CHAPTER_0.wav
-#     ..
-#   ROOT_DIR/voice/voxstr_CHAPTER_10.wav
-#   ROOT_DIR/voice/voxstr_full_book.wav
-#
-# Full AudioBook
-#
-#   ROOT_DIR/full_audiobook_all_voices.wav
 import cv2
 import subprocess
 import numpy as np
 import soundfile
-import docx  # pip install python-docx
 from pathlib import Path
 from moviepy.editor import *
@@ -25,98 +18,95 @@ ROOT_DIR = './tts_audiobooks/voices/'
 Path(ROOT_DIR).mkdir(parents=True,
                      exist_ok=True)
 voices = [
-        # 'en_US/hifi-tts_low#9017' ,
-        'en_US/m-ailabs_low#mary_ann',
-        'en_US/cmu-arctic_low#jmk',
-        # 'en_US/cmu-arctic_low#eey',
-        'en_UK/apope_low'
-        ]  # select any voice from - https://audeering.github.io/shift/
-d = docx.Document('../shift/assets/INCLUSION_IN_MUSEUMS_audiobook.docx')  # slightly changed from the original .docx to be audible as by adding extra 'by them from this of etc.'
 last_paragraph_was_silence = False  # to know to add silence only once after only at the 1st empty paragraph we detect
 chapter_counter = 0  # assure chapters start with CHAPTER: ONCE UPON A TIME
-youtube_video_parts = []  # audiobook .mp4 from each voice
 for vox in voices:
-    # string (map for assets/)
     vox_str = vox.replace(
                 '/', '_').replace(
                 '#', '_').replace(
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '').replace('-','')
     # create dir for chapter_x.wav & audiobook.wav - for this voice vox
     Path(ROOT_DIR + vox_str + '/').mkdir(parents=True,
                                          exist_ok=True)
     print(vox)
     # for new voice start list of audio tiles making up the 1st chapter of book
     total = []
     chapter = []
-    for para in d.paragraphs[:41]:
         t = para.text
         # start new chapter
         if t.startswith('CHAPTER:'):
             # silence for end chapter
-            chapter.append(np.zeros(int(.1 * FS),
-                                    dtype=np.float32))
             # chapter.wav
             audio = np.concatenate(chapter)
             soundfile.write(
-                ROOT_DIR + vox_str + f'/{vox_str}_chapter_{chapter_counter}.wav',
-                audio,
-                FS)  # 27400?
             # fill AUDIO of this chapter into total (for complete audiobook)
             total.append(audio)
             # new chapter
             chapter = []
             chapter_counter += 1
         # If paragraph is non empty -> TTS
         if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:
             # place paragraph text to .txt for tts.py
             with open('_tmp.txt', 'w') as f:
                 f.write(t.lower())  # WARNING! cast to lower otherwise accesibiliTy is pronounces accessibili..tay
-            print(t,'\n_____________________________\n')
             # TTS
             subprocess.run(
                 [
                 "python",
@@ -128,40 +118,41 @@ for vox in voices:
                 # '--scene', 'calm sounds of castle',
                 '--voice', vox,
                 '--out_file', '_tmp'  # save on _tmp load audio and concat to total
-                    ])
             audio, _fs = soundfile.read('out/_tmp.wav')
-            print('CHAPTER\n\n\n\n____', audio.shape,'____\n')
             chapter.append(audio)
             # flag
             last_paragraph_was_silence = False
-        # append silence if empty paragraph (e.g. end of Section)
         else:
             if not last_paragraph_was_silence:  # skip multiple empty pargraphs - silence is added only once
                 chapter.append(np.zeros(int(.1 * FS),
-                                        dtype=np.float32))
                 last_paragraph_was_silence = True
     # save full .wav audiobook - for this voice
     soundfile.write(
-                    ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
-                    np.concatenate(total),
-                    FS)  # 27400?
     # pic TTS voice
-    voice_pic = np.zeros((768, 1024, 3), dtype=np.uint8)
     shift_logo = cv2.imread('assets/shift_banner.png')
@@ -177,27 +168,35 @@ for vox in voices:
     lineType               = 2
     # voice
     cv2.putText(voice_pic, vox, #'en_US/m-ailabs_low#mary_ann',
-        bottomLeftCornerOfText,
-        font,
-        fontScale,
-        fontColor,
-        thickness,
-        lineType)
-    # =
     cv2.putText(voice_pic, 'TTS voice =',
-        (0, 500),
-        font,
-        fontScale,
-        fontColor,
-        thickness,
-        lineType)
     STATIC_FRAME = '_tmp.png'
     cv2.imwrite(STATIC_FRAME, voice_pic)
     # MoviePy silence video
     SILENT_VIDEO = '_tmp.mp4'
     # SILENT CLIP
@@ -207,54 +206,23 @@ for vox in voices:
     # fuse vox_full_audiobook.wav & SILENT_VIDEO -> TO FINALLY CONCATENATE into YouTube Video
     # write final output video
     subprocess.call(
         ["ffmpeg",
-            "-y",
-            "-i",
-            SILENT_VIDEO,
-            "-i",
-            ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
-            "-c:v",
-            "copy",
-            "-map",
-            "0:v:0",
-            "-map",
-            " 1:a:0",
-            ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4',       #  OUT_FILE
-            ])
-    youtube_video_parts.append(ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4')
-# Final vid for YouTube
-with open('_youtube_video_parts.txt', 'w') as f:
-    _str = 'file ' + ' \n file '.join(youtube_video_parts)
-    f.write(_str)
-# # list of audiobooks of single vox
-# # --
-# # $ cat mylist.txt
-# # file '/path/to/file1'
-# # file '/path/to/file2'
-# # file '/path/to/file3'
-youtube_video_file = 'audiobook_shift_youtube.mp4'
-# ffmpeg -f concat -i video_parts.txt -c copy output.mp4
-subprocess.call(
-            ["ffmpeg",
-                "-y",  # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
-                "-safe",
-                "0",  # https://stackoverflow.com/questions/38996925/ffmpeg-concat-unsafe-file-name
-                "-f",
-                "concat", # https://stackoverflow.com/questions/7333232/how-to-concatenate-two-mp4-files-using-ffmpeg
-                "-i",
-                '_youtube_video_parts.txt',
-                "-c",
-                "copy",
-                youtube_video_file]
-            )

+# creates .wav file per chapter & full audiobook.wav for assets/INCLUSION_IN_MUSEUMS_audiobook.docx
+# __________________________________________________________________________________________________
+#   ROOT_DIR/voice/voice_chapter_0.wav, .., ROOT_DIR/voice/voice_chapter_10.wav
+#   ROOT_DIR/voice/voice_full_audiobook.wav
 import cv2
 import subprocess
 import numpy as np
 import soundfile
+import docx  # package = python-docx
+import audresample
+import urllib
 from pathlib import Path
 from moviepy.editor import *
 Path(ROOT_DIR).mkdir(parents=True,
                      exist_ok=True)
 voices = [
+    'en_US/vctk_low#p228',
+    # 'en_US/vctk_low#p326',
+    ]  # select any voice from - https://audeering.github.io/shift/
+urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "tmp.docx")
+d = docx.Document('tmp.docx')  # slightly changed from the original .docx to sound natural when read aloud, by adding extra words such as 'by', 'them', 'from', 'this', 'of', etc.
 last_paragraph_was_silence = False  # flag so that silence is added only once, at the 1st empty paragraph we detect
 chapter_counter = 0  # assure chapters start with CHAPTER: ONCE UPON A TIME
 for vox in voices:
+    # string cleanup
     vox_str = vox.replace(
                 '/', '_').replace(
                 '#', '_').replace(
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '').replace('-','')
     # create dir for chapter_x.wav & audiobook.wav - for this voice vox
     Path(ROOT_DIR + vox_str + '/').mkdir(parents=True,
                                          exist_ok=True)
     print(vox)
     # for new voice start list of audio tiles making up the 1st chapter of book
     total = []
     chapter = []
+    for para in d.paragraphs:  #[:41]
         t = para.text
         # start new chapter
         if t.startswith('CHAPTER:'):
             # silence for end chapter
+            chapter.append(np.zeros(int(.24 * FS),
+            dtype=np.float32))
             # chapter.wav
             audio = np.concatenate(chapter)
             soundfile.write(
+                        ROOT_DIR + vox_str + f'/{vox_str}_chapter_{chapter_counter}.wav',
+                        audio,
+                        16000)  # 27400?
             # fill AUDIO of this chapter into total (for complete audiobook)
             total.append(audio)
             # new chapter
             chapter = []
             chapter_counter += 1
+            print(f'Start Chapter {chapter_counter}, timestamp:{int(np.concatenate(total).shape[0]/16000)//60}:{int(np.concatenate(total).shape[0]/16000)%60}')
         # If paragraph is non empty -> TTS
         if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:
             # place paragraph text to .txt for tts.py
             with open('_tmp.txt', 'w') as f:
                 f.write(t.lower())  # WARNING! cast to lower, otherwise accesibiliTy is pronounced accessibili..tay
             # TTS
             subprocess.run(
                 [
                 "python",
                 # '--scene', 'calm sounds of castle',
                 '--voice', vox,
                 '--out_file', '_tmp'  # save on _tmp load audio and concat to total
+                ])
             audio, _fs = soundfile.read('out/_tmp.wav')
+            audio = audresample.resample(audio.astype(np.float32), 24000, 16000)[0, :]
+            # print('CHAPTER\n\n\n\n____', audio.shape,'____\n')
             chapter.append(audio)
             # flag
             last_paragraph_was_silence = False
+            # append silence if empty paragraph (e.g. end of Section)
         else:
             if not last_paragraph_was_silence:  # skip multiple empty paragraphs - silence is added only once
                 chapter.append(np.zeros(int(.1 * FS),
+                               dtype=np.float32))
                 last_paragraph_was_silence = True
     # save full .wav audiobook - for this voice
     soundfile.write(
+            ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
+            np.concatenate(total),
+            16000)  # 27400?
     # pic TTS voice
+    voice_pic = np.zeros((574, 1024, 3), dtype=np.uint8)
     shift_logo = cv2.imread('assets/shift_banner.png')
     lineType               = 2
     # voice
     cv2.putText(voice_pic, vox, #'en_US/m-ailabs_low#mary_ann',
+                bottomLeftCornerOfText,
+                font,
+                fontScale,
+                fontColor,
+                thickness,
+                lineType)
+    # = AUDIOBOOK
+    cv2.putText(voice_pic, 'AUDIOBOOK',
+                (170, 170),
+                font,
+                4,
+                fontColor,
+                thickness,
+                lineType)
+    # = VOICE
     cv2.putText(voice_pic, 'TTS voice =',
+                (0, 500),
+                font,
+                fontScale,
+                fontColor,
+                thickness,
+                lineType)
     STATIC_FRAME = '_tmp.png'
     cv2.imwrite(STATIC_FRAME, voice_pic)
     # MoviePy silence video
     SILENT_VIDEO = '_tmp.mp4'
     # SILENT CLIP
     # fuse vox_full_audiobook.wav & SILENT_VIDEO -> TO FINALLY CONCATENATE into YouTube Video
     # write final output video
     subprocess.call(
         ["ffmpeg",
+        "-y",
+        "-i",
+        SILENT_VIDEO,
+        "-i",
+        ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
+        "-c:v",
+        "copy",
+        "-map",
+        "0:v:0",
+        "-map",
+        " 1:a:0",
+        ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4',       #  OUT_FILE
+        ])

msinference.py CHANGED Viewed

@@ -366,23 +366,13 @@ class TextForeign(object):
         return text_norm
     def filter_oov(self, text, lang=None):
-        text = self.preprocess_char(text, lang=lang)
         val_chars = self._symbol_to_id
         txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
         return txt_filt
-    def preprocess_char(self, text, lang=None):
-        """
-        Special treatement of characters in certain languages
-        """
-        if lang == "ron":
-            text = text.replace("ț", "ţ")
-            print(f"{lang} (ț -> ţ): {text}")
-        return text
-def foreign(text=None, lang='romanian', speed=None):
-    # TTS for non english languages supported by
     # https://huggingface.co/spaces/mms-meta/MMS
     if 'hun' in lang.lower():
@@ -391,9 +381,9 @@ def foreign(text=None, lang='romanian', speed=None):
     elif 'ser' in lang.lower():
-        if has_cyrillic(text):
-            lang_code = 'rmc-script_cyrillic'   # romani carpathian (has also Vlax)
         else:
@@ -439,28 +429,39 @@ def foreign(text=None, lang='romanian', speed=None):
     # TTS via MMS
     is_uroman = hps.data.training_files.split(".")[-1] == "uroman"
-    if is_uroman:
-        uroman_dir = "Utils/uroman"
-        assert os.path.exists(uroman_dir)
-        uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
-        text = text_mapper.uromanize(text, uroman_pl)
-    text = text.lower()
-    text = text_mapper.filter_oov(text, lang=lang)
-    stn_tst = text_mapper.get_text(text, hps)
-    with torch.no_grad():
-        print(f'{speed=}\n\n\n\n_______________________________')
-        x_tst = stn_tst.unsqueeze(0).to(device)
-        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
-        x = (
-            net_g.infer(
-                x_tst,
-                x_tst_lengths,
-                noise_scale=0.667,
-                noise_scale_w=0.8,
-                length_scale=1.0 / speed)[0][0, 0].cpu().float().numpy()
             )
     x /= np.abs(x).max() + 1e-7
     # hyp = (hyp * 32768).astype(np.int16)
@@ -470,6 +471,9 @@ def foreign(text=None, lang='romanian', speed=None):
     x = audresample.resample(signal=x.astype(np.float32),
                              original_rate=16000,
                              target_rate=24000)[0, :]  # reshapes (64,) -> (1,64)
     return x

         return text_norm
     def filter_oov(self, text, lang=None):
         """Return `text` with out-of-vocabulary characters removed.

         A character is kept only if it is contained in
         `self._symbol_to_id`; the relative order of kept characters is
         unchanged. `lang` is accepted for call compatibility and is not
         used by this method.
         """
         known_symbols = self._symbol_to_id
         kept = [ch for ch in text if ch in known_symbols]
         return "".join(kept)
+def foreign(text=None,   # list of text
+            lang='romanian',
+            speed=None):
     # https://huggingface.co/spaces/mms-meta/MMS
     if 'hun' in lang.lower():
     elif 'ser' in lang.lower():
+        if has_cyrillic(text[0]):  # check 0-th sentence if is cyrillic
+            lang_code = 'rmc-script_cyrillic'   # romani carpathian (also has latin/cyrillic Vlax)
         else:
     # TTS via MMS
     is_uroman = hps.data.training_files.split(".")[-1] == "uroman"
+    # CALL TTS
+    x = []
+    for _t in text:
+        if is_uroman:
+            uroman_dir = "Utils/uroman"
+            assert os.path.exists(uroman_dir)
+            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
+            _t = text_mapper.uromanize(_t, uroman_pl)
+        _t = _t.lower().replace("ţ", "ț").replace('ț','ts') #.replace('ț', 'ts').replace('Ţ', 'ts').replace('î', 'u').replace('Î', 'u')
+        _t = text_mapper.filter_oov(_t, lang=lang)
+        # print(f'{speed=}\n\n\n\n_______________________________ {_t}')
+        stn_tst = text_mapper.get_text(_t, hps)
+        with torch.no_grad():
+            x_tst = stn_tst.unsqueeze(0).to(device)
+            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
+            x.append(
+                net_g.infer(
+                    x_tst,
+                    x_tst_lengths,
+                    noise_scale=0.667,
+                    noise_scale_w=1, #0, #0.8,
+                    length_scale=1.0 / speed)[0][0, 0].cpu().float().numpy()
             )
+    x = np.concatenate(x)
     x /= np.abs(x).max() + 1e-7
     # hyp = (hyp * 32768).astype(np.int16)
     x = audresample.resample(signal=x.astype(np.float32),
                              original_rate=16000,
                              target_rate=24000)[0, :]  # reshapes (64,) -> (1,64)
     return x