from typing import Dict, List, Any # from datasets import load_dataset # from transformers import AutoProcessor, MusicgenForConditionalGeneration import torch, numpy as np import io, os import soundfile as sf from audiocraft.models import MusicGen import yaml import math import base64 import torchaudio import torch def get_bip_bip( bip_duration=0.125, frequency=440, duration=0.5, sample_rate=32000, device="cuda"): """Generates a series of bip bip at the given frequency.""" t = torch.arange( int(duration * sample_rate), device="cuda", dtype=torch.float) / sample_rate wav = torch.cos(2 * math.pi * 440 * t)[None] tp = (t % (2 * bip_duration)) / (2 * bip_duration) envelope = (tp >= 0.5).float() return wav * envelope def load_conf(conf): with open(conf,'r') as f: conf= yaml.safe_load(f) return conf class generator: def __init__(self, conf_file): """ conf{ model sampling_rate } """ self.conf = load_conf(conf_file) # self.processor = AutoProcessor.from_pretrained( # self.conf['model'], token=os.environ.get('token', None)) self.model = MusicGen.get_pretrained( self.conf['model']) self.model.set_generation_params( use_sampling=True, top_k=250, duration=self.conf['duration'] ) device = "cuda" if torch.cuda.is_available() else "cpu" # self.model.to(device) self.sampling_rate = self.model.sample_rate # config.audio_encoder.sampling_rate def preprocess(self, text, audio=None): print(f"Preprocess: Text:{text is not None}{len(text)} | audio:{audio is not None}{len(audio)}") # if audio is not None and float(self.conf['nth_slice_prompt']) >=2: # audio = audio[: int(len(audio) // self.conf['nth_slice_prompt'])] # print(f"Post-Preprocess: Text:{len(text)} | audio:{len(audio)}") audio = audio.unsqueeze(0) print(f"Text:{len(text)} | audio:{len(audio)}") return text, audio def generate(self, text:list, audio: np.array=None, args:dict={}): """ text: ["modern melodic electronic dance music", "80s blues track with groovy saxophone"] audio (np.array) """ # inputs = self.processor( # audio=audio, # sampling_rate=self.conf["sampling_rate"], # text=text, # padding=True, # return_tensors="pt", # ) if args.get('sr'): sr = args.get('sr') else: sr = self.conf['sampling_rate'] text, audio = self.preprocess(text, audio) if self.conf['model'] == 'melody' and audio is not None: output = self.model.generate_with_chroma( descriptions=[ text ], melody_wavs=audio, melody_sample_rate=sr, # progress=True ) else: output = self.model.generate_continuation( get_bip_bip(0.125), # .expand(2, -1, -1), 32000, text, # progress_bar=True ) return output class EndpointHandler: def __init__(self, path=""): # load model and processor from path # self.model = MusicGen.from_pretrained( # path, torch_dtype=torch.float16).to("cuda") self.generator = generator(os.path.join(path, '.conf/generation_conf.yaml')) def __call__(self, data: Dict[str, Any]) -> Dict[str, str]: """ Args: data (:dict:): The payload with the text prompt and generation parameters. """ # prompt_duration = 2 # process input inputs = data.pop('inputs') text = inputs.pop("text", inputs) audio = inputs.pop("audio", inputs) print(type(text), type(audio), len(text), len(audio)) parameters = data.pop("parameters", None) audio = base64.b64decode(audio) audio, sr = torchaudio.load(io.BytesIO(audio)) # audio, sr = sf.read(io.BytesIO(audio)) output = self.generator.generate(text, audio, {'sr':sr}) # # pass inputs with all kwargs in data # if parameters is not None: # with torch.autocast("cuda"): # outputs = self.model.generate(**inputs, **parameters) # else: # with torch.autocast("cuda"): # outputs = self.model.generate(**inputs,) # postprocess the prediction prediction = output.squeeze().cpu().numpy().tolist() return [{"generated_audio": prediction}]