# Hugging Face Spaces page residue (Space status when captured: Sleeping).
import gradio as gr | |
from musiclang_predict import MusicLangPredictor | |
import random | |
import subprocess | |
import os | |
import torchaudio | |
import torch | |
import numpy as np | |
from audiocraft.models import MusicGen | |
from audiocraft.data.audio import audio_write | |
from pydub import AudioSegment | |
import spaces | |
# Utility Functions | |
def peak_normalize(y, target_peak=0.97):
    """Scale *y* so that its largest absolute sample equals *target_peak*.

    Args:
        y: Array-like of audio samples.
        target_peak: Desired maximum absolute amplitude after scaling.

    Returns:
        The rescaled NumPy array. Empty or all-zero input is returned
        unscaled, because there is no peak to normalize against and dividing
        by zero would fill the result with NaN.
    """
    y = np.asarray(y)
    if y.size == 0:
        return y
    peak = np.max(np.abs(y))
    if peak == 0:
        # Pure silence: scaling is undefined, so leave the samples alone.
        return y
    return target_peak * (y / peak)
def rms_normalize(y, target_rms=0.05):
    """Scale *y* so that its root-mean-square level equals *target_rms*.

    Args:
        y: Array-like of audio samples.
        target_rms: Desired RMS level after scaling.

    Returns:
        The rescaled NumPy array. Empty or all-zero input is returned
        unscaled, because its RMS is zero (or undefined) and the gain factor
        would otherwise be infinite or NaN.
    """
    y = np.asarray(y)
    if y.size == 0:
        return y
    rms = np.sqrt(np.mean(y ** 2))
    if rms == 0:
        # Pure silence: no finite gain can reach the target level.
        return y
    return y * (target_rms / rms)
def preprocess_audio(waveform):
    """Return *waveform* with a new leading dimension, placed on ``device``.

    The tensor is copied to the CPU, stripped of size-1 dimensions, passed
    through NumPy, given a new leading dimension of size 1 and finally moved
    to the module-level ``device``.
    """
    # NumPy cannot read accelerator memory, so the CPU copy has to come first.
    samples = waveform.cpu().squeeze().numpy()
    # Loudness normalization is currently switched off:
    # samples = rms_normalize(peak_normalize(samples))
    batched = torch.from_numpy(samples).unsqueeze(0)
    return batched.to(device)
def create_slices(song, sr, slice_duration, bpm, num_slices=5):
    """Cut *num_slices* excerpts of *slice_duration* seconds out of *song*.

    The first excerpt always starts at the beginning of the song. Every other
    excerpt starts at a randomly chosen position that lies a whole number of
    4-beat bars (at *bpm*) after the end of the first excerpt, and wraps
    around to the start of the song when it runs past the end.

    Args:
        song: Audio tensor whose last dimension is time (samples).
        sr: Sample rate of *song* in Hz.
        slice_duration: Length of every excerpt in seconds.
        bpm: Tempo used to place the random start points on bar boundaries.
        num_slices: Number of excerpts to return.

    Returns:
        A list of *num_slices* tensors. When the song is not longer than one
        excerpt there is no valid random start point, so every entry is the
        (possibly shorter) first excerpt.
    """
    # Work in integer sample counts throughout; measuring the length via
    # ``len(x.squeeze())`` would report the channel count for stereo audio.
    total_samples = song.shape[-1]
    slice_samples = int(slice_duration * sr)
    # One bar of four beats, in samples. Clamp to 1 so range() never receives
    # a zero step at extreme tempos.
    bar_samples = max(int(4 * 60 / bpm * sr), 1)

    first_slice = song[..., :slice_samples]
    slices = [first_slice]

    # The candidate start points do not change between iterations.
    possible_starts = range(slice_samples, total_samples, bar_samples)

    for _ in range(1, num_slices):
        if not possible_starts:
            # No room for a second excerpt: reuse the opening one.
            slices.append(first_slice)
            continue

        start = random.choice(possible_starts)
        end = start + slice_samples
        if end > total_samples:
            # Wrap around to the beginning of the song. Because a start point
            # only exists when the song is longer than one excerpt, the
            # wrapped part always fits and the result has the full length.
            overflow = end - total_samples
            excerpt = torch.cat([song[..., start:], song[..., :overflow]], dim=-1)
        else:
            excerpt = song[..., start:end]
        slices.append(excerpt)

    return slices
def calculate_duration(bpm, min_duration=29, max_duration=30):
    """Return a duration in seconds that is a whole number of 4-beat bars.

    The bar count is first raised until the duration reaches *min_duration*,
    then lowered while the duration exceeds *max_duration*. The upper bound
    therefore wins when both cannot be met, and the result is never shorter
    than a single bar.

    Args:
        bpm: Tempo in beats per minute; must be positive.
        min_duration: Preferred lower bound in seconds.
        max_duration: Upper bound in seconds.

    Returns:
        The duration in seconds as a float.

    Raises:
        ValueError: If *bpm* is zero or negative. A negative tempo would
            otherwise make the first loop run forever.
    """
    if bpm <= 0:
        raise ValueError(f"bpm must be positive, got {bpm!r}")

    single_bar_duration = 4 * 60 / bpm
    bars = max(min_duration // single_bar_duration, 1)

    # Grow to the first bar count that reaches the minimum...
    while single_bar_duration * bars < min_duration:
        bars += 1
    duration = single_bar_duration * bars

    # ...then shrink again if that overshoots the maximum.
    while duration > max_duration and bars > 1:
        bars -= 1
        duration = single_bar_duration * bars

    return duration
def generate_music(seed, use_chords, chord_progression, prompt_duration, musicgen_model, num_iterations, bpm):
    """Generate a MusicLang MIDI sketch, let MusicGen continue it, return an MP3 path.

    Pipeline: MusicLang predicts a score, fluidsynth renders it to WAV, the
    WAV is cut into slices, a MusicGen model continues from the start of a
    slice once per iteration, and all continuations are concatenated into one
    MP3 file.

    Args:
        seed: Seed for MusicLang. Anything that cannot be converted to an int
            (including an empty string) is replaced by a random seed.
        use_chords: Whether to condition MusicLang on *chord_progression*.
        chord_progression: Chord string passed to ``predict_chords``; ignored
            when blank or when *use_chords* is false.
        prompt_duration: Seconds of each slice handed to MusicGen as prompt.
        musicgen_model: Dropdown label; the text before the first space is
            used as the pretrained model name.
        num_iterations: Number of continuations to generate and concatenate.
        bpm: Tempo used for the MIDI export, slice placement and duration.

    Returns:
        Filename of the combined MP3 (``combined_audio_<seed>.mp3``).

    Raises:
        RuntimeError: If every attempt failed with ``IndexError``.
        subprocess.CalledProcessError: If fluidsynth exits with an error.
    """
    # An IndexError triggers a retry with a fresh seed. The retry count is
    # bounded so a failure that does not depend on the seed cannot spin forever.
    max_attempts = 10

    try:
        seed = int(seed)
    except (TypeError, ValueError):
        seed = random.randint(1, 10000)

    # range() needs an int; coerce defensively in case the UI sends a float.
    num_iterations = int(num_iterations)

    # Load the MIDI model once instead of once per retry.
    ml = MusicLangPredictor('musiclang/musiclang-v2')

    last_error = None
    for _ in range(max_attempts):
        try:
            return _render_track(
                ml, seed, use_chords, chord_progression, prompt_duration,
                musicgen_model, num_iterations, bpm,
            )
        except IndexError as err:
            last_error = err
            seed = random.randint(1, 10000)

    raise RuntimeError(
        f"music generation failed with IndexError on {max_attempts} consecutive attempts"
    ) from last_error


def _render_track(ml, seed, use_chords, chord_progression, prompt_duration, musicgen_model, num_iterations, bpm):
    """Run one complete generation attempt for *seed* and return the MP3 filename."""
    nb_tokens = 1024
    temperature = 0.9
    top_p = 1.0

    midi_filename = f"output_{seed}.mid"
    wav_filename = f"output_{seed}.wav"
    # Everything listed here is deleted when the attempt ends, successful or not.
    temp_files = [midi_filename, wav_filename]

    try:
        if use_chords and chord_progression and chord_progression.strip():
            score = ml.predict_chords(
                chord_progression,
                time_signature=(4, 4),
                temperature=temperature,
                topp=top_p,
                rng_seed=seed,
            )
        else:
            score = ml.predict(
                nb_tokens=nb_tokens,
                temperature=temperature,
                topp=top_p,
                rng_seed=seed,
            )

        score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))
        # check=True surfaces a failed render here rather than as a confusing
        # load error on the missing WAV file below.
        subprocess.run(
            ["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"],
            check=True,
        )

        # Load the generated audio
        song, sr = torchaudio.load(wav_filename)
        song = song.to(device)

        # Use the user-provided BPM value for duration and slice placement
        duration = calculate_duration(bpm)
        slices = create_slices(song, sr, 35, bpm, num_slices=5)

        # The dropdown label looks like "<repo id> (<size>)"; keep the repo id.
        model_name = musicgen_model.split(" ")[0]
        model_continue = MusicGen.get_pretrained(model_name)
        model_continue.set_generation_params(
            use_sampling=True,
            top_k=250,
            top_p=0.0,
            temperature=1.0,
            duration=duration,
            cfg_coef=3,
        )

        continuation_files = []
        for i in range(num_iterations):
            slice_idx = i % len(slices)
            print(f"Running iteration {i + 1} using slice {slice_idx}...")

            prompt_waveform = slices[slice_idx][..., :int(prompt_duration * sr)]
            prompt_waveform = preprocess_audio(prompt_waveform)

            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
            output = output.cpu()  # Move the output tensor back to CPU
            # Make sure the output tensor has at most 2 dimensions
            if output.dim() > 2:
                output = output.squeeze()

            # audio_write appends the extension itself and returns the real
            # path, so use that instead of guessing the final filename. The
            # seed in the stem keeps different requests from sharing files.
            written_path = audio_write(
                f'continue_{seed}_{i}',
                output,
                model_continue.sample_rate,
                strategy="loudness",
                loudness_compressor=True,
            )
            continuation_files.append(str(written_path))
            temp_files.append(str(written_path))

        # Combine all audio files
        combined_audio = AudioSegment.empty()
        for filename in continuation_files:
            combined_audio += AudioSegment.from_wav(filename)

        combined_audio_filename = f"combined_audio_{seed}.mp3"
        combined_audio.export(combined_audio_filename, format="mp3")
        return combined_audio_filename
    finally:
        # Clean up temporary files, including after a failed attempt.
        for path in temp_files:
            if os.path.exists(path):
                os.remove(path)
# Run on the GPU when PyTorch can see a CUDA device; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Markdown snippets rendered inside the collapsible "more info" accordion of
# the UI. They are user-facing text only and do not affect generation.

# Short description of the MusicLang MIDI model, with project links.
musiclang_blurb = """
## musiclang
musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.
[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)
[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)
"""
# Short description of MusicGen and its continuation feature, with links.
musicgen_blurb = """
## musicgen
musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
see also https://youtube.com/@thecollabagepatch
"""
# Credits for the community fine-tuned models offered in the model dropdown.
finetunes_blurb = """
## fine-tuned models
the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
"""
# Build the Gradio UI: intro text and info accordion on top, generation
# controls in the left column, the resulting audio player in the right column.

# Dropdown labels; generate_music uses the part before the first space.
musicgen_models = [
    "thepatch/vanya_ai_dnb_0.1 (small)",
    "thepatch/budots_remix (small)",
    "thepatch/PhonkV2 (small)",
    "thepatch/bleeps-medium (medium)",
    "thepatch/hoenn_lofi (large)"
]

with gr.Blocks() as iface:
    gr.Markdown("# the-slot-machine")
    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
    gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model continue, semi-randomly, from different sections of the midi track. the slot machine combines em all at the end into something very bizarre. pick a number for the seed between 1 and 10k, or leave it blank to unlock the full rnjesus powers. if you wanna be lame, you can control the chord progression, prompt duration, musicgen model, number of iterations, and BPM.")

    with gr.Accordion("more info", open=False):
        for blurb in (musiclang_blurb, musicgen_blurb, finetunes_blurb):
            gr.Markdown(blurb)

    with gr.Row():
        with gr.Column():
            seed = gr.Textbox(label="seed (leave blank for random)", value="")
            use_chords = gr.Checkbox(label="control chord progression", value=False)
            chord_progression = gr.Textbox(label="chord progression (e.g., Am CM Dm E7 Am)", visible=True)
            prompt_duration = gr.Dropdown(
                label="prompt duration (seconds)",
                choices=list(range(1, 11)),
                value=7,
            )
            musicgen_model = gr.Dropdown(
                label="musicGen model",
                choices=musicgen_models,
                value=musicgen_models[0],
            )
            num_iterations = gr.Slider(label="number of iterations", minimum=1, maximum=10, step=1, value=3)
            bpm = gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=140)
            generate_button = gr.Button("generate music")
        with gr.Column():
            output_audio = gr.Audio(label="your track")

    # Component order here must match generate_music's positional parameters.
    generation_inputs = [
        seed,
        use_chords,
        chord_progression,
        prompt_duration,
        musicgen_model,
        num_iterations,
        bpm,
    ]
    generate_button.click(generate_music, inputs=generation_inputs, outputs=output_audio)

iface.launch()