# Hugging Face Spaces page header captured with this file (Space status: Sleeping); not part of the program.
import gradio as gr | |
from musiclang_predict import MusicLangPredictor | |
import random | |
import subprocess | |
import os | |
import torchaudio | |
import torch | |
import numpy as np | |
from audiocraft.models import MusicGen | |
from audiocraft.data.audio import audio_write | |
from pydub import AudioSegment | |
import spaces | |
import tempfile | |
from pydub import AudioSegment | |
# Select the compute device once at import time: CUDA when torch reports it
# as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Utility Functions | |
def peak_normalize(y, target_peak=0.97):
    """Scale *y* so that its largest absolute sample equals *target_peak*.

    Args:
        y: Array-like of audio samples.
        target_peak: Absolute peak value of the returned signal.

    Returns:
        A new NumPy array with the rescaled samples. Empty or all-zero
        (silent) input is returned as an unscaled copy, because there is no
        peak to scale from and dividing by zero would fill the result with
        NaN.
    """
    samples = np.asarray(y)
    if samples.size == 0:
        return samples.copy()

    peak = np.max(np.abs(samples))
    if peak == 0:
        # Silent signal: nothing to normalize.
        return samples.copy()

    return target_peak * (samples / peak)
def rms_normalize(y, target_rms=0.05):
    """Scale *y* so that its root-mean-square level equals *target_rms*.

    Args:
        y: Array-like of audio samples.
        target_rms: RMS level of the returned signal.

    Returns:
        A new NumPy array with the rescaled samples. Empty or all-zero
        (silent) input is returned as an unscaled copy, because its RMS is
        zero (or undefined) and dividing by it would produce NaN/inf.
    """
    samples = np.asarray(y)
    if samples.size == 0:
        return samples.copy()

    rms = np.sqrt(np.mean(samples ** 2))
    if rms == 0:
        # Silent signal: nothing to normalize.
        return samples.copy()

    return samples * (target_rms / rms)
def preprocess_audio(waveform):
    """Return *waveform* with a new leading axis, placed on the module `device`.

    The tensor is copied to the CPU, stripped of every size-1 dimension,
    round-tripped through NumPy, then given a leading dimension of size 1 and
    moved to `device`. No loudness normalization is applied here.

    Args:
        waveform: A torch tensor holding the audio samples.

    Returns:
        A torch tensor on `device` whose first dimension has size 1.
    """
    # NumPy conversion requires the data to live on the CPU.
    host_samples = waveform.cpu().squeeze().numpy()
    restored = torch.from_numpy(host_samples)
    return restored.unsqueeze(0).to(device)
def create_slices(song, sr, slice_duration, bpm, num_slices=5):
    """Cut *song* into `num_slices` excerpts of `slice_duration` seconds.

    The first excerpt always starts at sample 0. Every further excerpt starts
    at a randomly chosen position that lies a whole number of 4-beat bars
    after the end of the first excerpt. An excerpt that would run past the
    end of the song wraps around and is completed with samples from the
    beginning.

    Args:
        song: Audio tensor whose last dimension is time (samples); any
            leading dimensions (e.g. channels) are preserved.
        sr: Sample rate of `song` in Hz.
        slice_duration: Length of each excerpt in seconds.
        bpm: Tempo used to derive the bar length (4 beats per bar).
        num_slices: Total number of excerpts to return.

    Returns:
        A list of `num_slices` tensors. When the song is too short to offer
        any start position after the first excerpt, the first excerpt is
        repeated instead.
    """
    total_samples = song.shape[-1]
    slice_samples = int(slice_duration * sr)
    # Length of one 4-beat bar in samples; clamped so range() never receives
    # a zero step for extreme bpm / sample-rate combinations.
    bar_samples = max(int(4 * 60 / bpm * sr), 1)

    first_slice = song[..., :slice_samples]
    slices = [first_slice]

    # The candidate start positions do not change between iterations, so
    # they are computed once.
    candidate_starts = range(slice_samples, total_samples, bar_samples)

    for _ in range(1, num_slices):
        if not candidate_starts:
            # No valid start position: fall back to the first excerpt.
            slices.append(first_slice)
            continue

        start = random.choice(candidate_starts)
        end = start + slice_samples

        if end > total_samples:
            # Wrap around to the beginning of the song.
            overflow = end - total_samples
            excerpt = torch.cat([song[..., start:], song[..., :overflow]], dim=-1)
        else:
            excerpt = song[..., start:end]

        # Measure the length along the time axis. len() on a multi-channel
        # tensor would report the channel count instead of the sample count.
        shortfall = slice_samples - excerpt.shape[-1]
        if shortfall > 0:
            excerpt = torch.cat([excerpt, song[..., :shortfall]], dim=-1)

        slices.append(excerpt)

    return slices
def calculate_duration(bpm, min_duration=29, max_duration=30):
    """Return a duration in seconds that is a whole number of 4-beat bars.

    The bar count is first raised until the duration reaches `min_duration`,
    then lowered (never below one bar) while the duration exceeds
    `max_duration`. The upper bound therefore wins when both cannot be met.

    Args:
        bpm: Tempo in beats per minute.
        min_duration: Preferred lower bound in seconds.
        max_duration: Upper bound in seconds.

    Returns:
        The duration in seconds as a float.
    """
    seconds_per_bar = 240 / bpm
    bar_count = max(min_duration // seconds_per_bar, 1)

    # Grow until the lower bound is reached.
    while seconds_per_bar * bar_count < min_duration:
        bar_count += 1

    # Shrink back while the upper bound is exceeded, keeping at least one bar.
    while bar_count > 1 and seconds_per_bar * bar_count > max_duration:
        bar_count -= 1

    return seconds_per_bar * bar_count
def generate_midi(seed, use_chords, chord_progression, bpm):
    """Generate a MIDI score with MusicLang and render it to a WAV file.

    Args:
        seed: RNG seed as text or number. Anything that cannot be converted
            to an int (including an empty string or None) is replaced by a
            random seed between 1 and 10000.
        use_chords: When true and `chord_progression` is non-blank, the score
            is generated from that chord progression; otherwise it is
            generated unconditionally.
        chord_progression: Chord progression text passed to the predictor.
        bpm: Tempo written into the MIDI file.

    Returns:
        Path of the rendered WAV file (``output_<seed>.wav``).

    Raises:
        subprocess.CalledProcessError: If fluidsynth exits with a non-zero
            status, so a failed render is reported instead of returning the
            path of a file that was never written.
    """
    try:
        seed = int(seed)
    except (TypeError, ValueError):
        seed = random.randint(1, 10000)

    ml = MusicLangPredictor('musiclang/musiclang-v2')

    temperature = 0.9
    top_p = 1.0

    if use_chords and chord_progression and chord_progression.strip():
        score = ml.predict_chords(
            chord_progression,
            time_signature=(4, 4),
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )
    else:
        score = ml.predict(
            nb_tokens=1024,
            temperature=temperature,
            topp=top_p,
            rng_seed=seed
        )

    midi_filename = f"output_{seed}.mid"
    wav_filename = midi_filename.replace(".mid", ".wav")

    score.to_midi(midi_filename, tempo=bpm, time_signature=(4, 4))
    try:
        # Render the MIDI file to WAV at 44100 Hz with the font.sf2 soundfont.
        subprocess.run(
            ["fluidsynth", "-ni", "font.sf2", midi_filename, "-F", wav_filename, "-r", "44100"],
            check=True,
        )
    finally:
        # The intermediate MIDI file is removed whether or not rendering worked.
        if os.path.exists(midi_filename):
            os.remove(midi_filename)

    return wav_filename
def generate_music(wav_filename, prompt_duration, musicgen_model, num_iterations, bpm):
    """Generate MusicGen continuations of excerpts of a WAV file and join them.

    The input audio is cut into five 35-second slices. Each iteration takes
    the first `prompt_duration` seconds of one slice (cycling through the
    slices) as the prompt for a continuation. All generated clips are
    concatenated and exported as one MP3.

    Args:
        wav_filename: Path of the audio file to continue from.
        prompt_duration: Length of the prompt in seconds.
        musicgen_model: Model choice text; everything before the first space
            is used as the pretrained model name.
        num_iterations: Number of continuations to generate. It is converted
            with int() so a whole-number float from the UI is accepted.
        bpm: Tempo used for slicing and for the generation duration.

    Returns:
        Path of the combined MP3 file.
    """
    # Load the audio from the passed file path
    song, sr = torchaudio.load(wav_filename)
    song = song.to(device)

    # Use the user-provided BPM value for duration calculation and slicing
    duration = calculate_duration(bpm)
    slices = create_slices(song, sr, 35, bpm, num_slices=5)

    # Load the model
    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)

    # Setting generation parameters
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=duration,
        cfg_coef=3
    )

    prompt_samples = int(prompt_duration * sr)
    all_audio_files = []
    try:
        for i in range(int(num_iterations)):
            slice_idx = i % len(slices)
            print(f"Running iteration {i + 1} using slice {slice_idx}...")

            prompt_waveform = slices[slice_idx][..., :prompt_samples]
            prompt_waveform = preprocess_audio(prompt_waveform)

            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
            output = output.cpu()  # Move the output tensor back to CPU

            # Make sure the output tensor has at most 2 dimensions
            if len(output.size()) > 2:
                output = output.squeeze()

            filename_without_extension = f'continue_{i}'
            filename_with_extension = f'{filename_without_extension}.wav'
            # The written file is expected at '<name>.wav.wav' (assuming
            # audio_write appends its own '.wav' suffix). It is registered
            # before writing so the cleanup below also covers a partial write.
            all_audio_files.append(f'{filename_without_extension}.wav.wav')
            audio_write(filename_with_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)

        # Combine all audio files
        combined_audio = AudioSegment.empty()
        for filename in all_audio_files:
            combined_audio += AudioSegment.from_wav(filename)

        combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
        combined_audio.export(combined_audio_filename, format="mp3")
    finally:
        # Clean up temporary files even when generation or export failed
        for filename in all_audio_files:
            if os.path.exists(filename):
                os.remove(filename)

    return combined_audio_filename
def continue_music(input_audio_path, prompt_duration, musicgen_model, num_iterations, bpm):
    """Extend an audio file with MusicGen continuations of its ending.

    The last `prompt_duration` seconds of the input are used as the prompt
    for every iteration. The original audio followed by all generated clips
    is exported as one MP3.

    Args:
        input_audio_path: Path of the audio file to extend.
        prompt_duration: Length of the prompt in seconds, taken from the end
            of the input.
        musicgen_model: Model choice text; everything before the first space
            is used as the pretrained model name.
        num_iterations: Number of continuations to generate. It is converted
            with int() so a whole-number float from the UI is accepted.
        bpm: Tempo used to compute the generation duration.

    Returns:
        Path of the combined MP3 file.

    Raises:
        ValueError: If the input audio is shorter than `prompt_duration`.
    """
    # Load the audio from the given file path
    song, sr = torchaudio.load(input_audio_path)
    song = song.to(device)

    # Calculate the slice from the end of the song based on prompt_duration
    num_samples = int(prompt_duration * sr)
    if song.shape[-1] < num_samples:
        raise ValueError("The prompt_duration is longer than the audio length.")
    prompt_waveform = song[..., song.shape[-1] - num_samples:]

    # Prepare the audio slice for generation
    prompt_waveform = preprocess_audio(prompt_waveform)

    # Load the model and set generation parameters
    model_continue = MusicGen.get_pretrained(musicgen_model.split(" ")[0])
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=calculate_duration(bpm),
        cfg_coef=3
    )

    # from_file lets the decoder detect the container, so inputs that are not
    # MP3 (e.g. a re-uploaded WAV) are accepted as well.
    combined_audio = AudioSegment.from_file(input_audio_path)  # Start with the original audio

    file_paths_for_cleanup = []  # Generated file paths to delete afterwards
    try:
        for i in range(int(num_iterations)):
            output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
            output = output.cpu()  # Move the output tensor back to CPU
            if len(output.size()) > 2:
                output = output.squeeze()

            filename_without_extension = f'continue_{i}'
            filename_with_extension = f'{filename_without_extension}.wav'
            # The written file is expected at '<name>.wav.wav' (assuming
            # audio_write appends its own '.wav' suffix). It is registered
            # before writing so the cleanup below also covers a partial write.
            correct_filename_extension = f'{filename_without_extension}.wav.wav'
            file_paths_for_cleanup.append(correct_filename_extension)
            audio_write(filename_with_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)

            # Append the new clip to the running result
            combined_audio += AudioSegment.from_wav(correct_filename_extension)

        combined_audio_filename = f"combined_audio_{random.randint(1, 10000)}.mp3"
        combined_audio.export(combined_audio_filename, format="mp3")
    finally:
        # Clean up temporary files even when generation or export failed
        for file_path in file_paths_for_cleanup:
            if os.path.exists(file_path):
                os.remove(file_path)

    return combined_audio_filename
# Markdown text for the expandable sections of the UI.

# Short description of MusicLang with links to its GitHub repository and
# Hugging Face Space.
musiclang_blurb = """
## musiclang
musiclang is a controllable ai midi model. it can generate midi sequences based on user-provided parameters, or unconditionally.
[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> musiclang github](https://github.com/MusicLang/musiclang_predict)
[<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Hugging Face" width="20" style="vertical-align:middle"> musiclang huggingface space](https://huggingface.co/spaces/musiclang/musiclang-predict)
"""
# Short description of MusicGen and its continuation feature, with links to
# the audiocraft repository and example audio.
musicgen_blurb = """
## musicgen
musicgen is a transformer-based music model that generates audio. It can also do something called a continuation, which was initially meant to extend musicgen outputs beyond 30 seconds. it can be used with any input audio to produce surprising results.
[<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" width="20" style="vertical-align:middle"> audiocraft github](https://github.com/facebookresearch/audiocraft)
visit https://thecollabagepatch.com/infinitepolo.mp3 or https://thecollabagepatch.com/audiocraft.mp3 to hear continuations in action.
see also https://youtube.com/@thecollabagepatch
"""
# Credits for the community fine-tunes, with links to the Discord server and
# a fine-tuning Colab notebook.
finetunes_blurb = """
## fine-tuned models
the fine-tunes hosted on the huggingface hub are provided collectively by the musicgen discord community. thanks to vanya, mj, hoenn, septicDNB and of course, lyra.
[<img src="https://cdn.iconscout.com/icon/free/png-256/discord-3691244-3073764.png" alt="Discord" width="20" style="vertical-align:middle"> musicgen discord](https://discord.gg/93kX8rGZ)
[<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="vertical-align:middle"> fine-tuning colab notebook by lyra](https://colab.research.google.com/drive/13tbcC3A42KlaUZ21qvUXd25SFLu8WIvb)
"""
# Per-model notes: one "##" section for each fine-tuned model.
fine_tunes_info = """
## thepatch/vanya_ai_dnb_0.1
thepatch/vanya_ai_dnb_0.1 was trained by vanya. [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@veryVANYA) . it treats almost all input audio as the beginning of a buildup to a dnb drop (can do downtempo well)
## thepatch/bleeps-medium
thepatch/bleeps-medium was trained by kevin and lyra [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@_lyraaaa_) . it is a medium model. it's more melodic and ambient sometimes than vanya's, but there's a 50/50 chance it gets real heavy with the edm vibes. It can be amazing at turning your chords into pads, and is a good percussionist.
## thepatch/budots_remix
thepatch/budots_remix was trained by MJ BERSABEph. budots is a dope niche genre from the philippines apparently. this one will often do fascinating, demonic, kinds of vocal chopping. warning: it tends to speed up and slow down tempo, which makes it hard to use in a daw.
## thepatch/hoenn_lofi
thepatch/hoenn_lofi is a large fine-tune by hoenn. [![Twitter](https://huggingface.co/front/assets/huggingface_logo-noborder.svg)](https://twitter.com/@eschatolocation) . this model is a large boi, and it shows. even tho it is trained to do lo-fi, its ability to run with your melodies and not ruin them is unparalleled among the fine-tunes so far.
## thepatch/PhonkV2
thepatch/PhonkV2 was trained by MJ BERSABEph. there are multiple versions in the discord.
"""
# Create the Gradio interface.
#
# Layout: a title and intro text, two collapsed accordions with background
# information, then a row with two columns: MIDI generation controls on the
# left, MusicGen controls and outputs on the right.
with gr.Blocks() as iface:
    gr.Markdown("# the-slot-machine")
    gr.Markdown("two ai's jamming. warning: outputs will be very strange, likely stupid, and possibly rad.")
    gr.Markdown("this is a musical slot machine. using musiclang, we get a midi output. then, we let a musicgen model to continue the from the beginning of the midi model's generation. then, musicgen can continue from the end of its own output. re-upload, trim and repeat with a different fine-tune and prompt duration for the coolest outputs.")

    with gr.Accordion("more info", open=False):
        gr.Markdown(musiclang_blurb)
        gr.Markdown(musicgen_blurb)
        gr.Markdown(finetunes_blurb)

    with gr.Accordion("fine-tunes info", open=False):
        # The per-model notes are stored in `fine_tunes_info`.
        gr.Markdown(fine_tunes_info)

    with gr.Row():
        with gr.Column():
            seed = gr.Textbox(label="Seed (leave blank for random)", value="")
            use_chords = gr.Checkbox(label="Control Chord Progression", value=False)
            chord_progression = gr.Textbox(label="Chord Progression (e.g., Am CM Dm E7 Am)", visible=True)
            bpm = gr.Slider(label="BPM", minimum=60, maximum=200, step=1, value=120)
            generate_midi_button = gr.Button("Generate MIDI")
            midi_audio = gr.Audio(label="Generated MIDI Audio", type="filepath")  # Ensure this is set to handle file paths

        with gr.Column():
            prompt_duration = gr.Dropdown(label="Prompt Duration (seconds)", choices=list(range(1, 11)), value=5)
            musicgen_model = gr.Dropdown(label="MusicGen Model", choices=[
                "thepatch/vanya_ai_dnb_0.1 (small)",
                "thepatch/budots_remix (small)",
                "thepatch/PhonkV2 (small)",
                "thepatch/bleeps-medium (medium)",
                "thepatch/hoenn_lofi (large)"
            ], value="thepatch/vanya_ai_dnb_0.1 (small)")
            # Both generation handlers take the iteration count as an input,
            # so the component must exist before the click wiring below.
            num_iterations = gr.Slider(label="Number of Iterations", minimum=1, maximum=10, step=1, value=1)
            generate_music_button = gr.Button("Generate Music")
            output_audio = gr.Audio(label="Generated Music", type="filepath")
            continue_button = gr.Button("Continue Generating Music")
            continue_output_audio = gr.Audio(label="Continued Music Output", type="filepath")

    # Connecting the components
    generate_midi_button.click(generate_midi, inputs=[seed, use_chords, chord_progression, bpm], outputs=[midi_audio])
    generate_music_button.click(generate_music, inputs=[midi_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=[output_audio])
    continue_button.click(continue_music, inputs=[output_audio, prompt_duration, musicgen_model, num_iterations, bpm], outputs=continue_output_audio)

iface.launch()