---
language:
- ja
base_model:
- google/gemma-2-2b-jpn-it
pipeline_tag: audio-text-to-text
license: gemma
datasets:
- fixie-ai/common_voice_17_0
---
|
```py
|
import transformers |
|
import librosa |
|
import torch |
|
import numpy as np |
|
from typing import Dict, Any |
|
|
|
# Hub repository id used for both the model and its processor.
repo_id = "neody/ultravox-gemma-2-2b-jpn-it"

# Both loads pass trust_remote_code=True.
model = transformers.AutoModel.from_pretrained(repo_id, trust_remote_code=True)
model.to("cuda", dtype=torch.bfloat16)
processor = transformers.AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)

# Load the recording, asking librosa for a 16 kHz sampling rate.
path = "record.wav"
audio, sr = librosa.load(path, sr=16000)
|
|
|
|
|
def preprocess(inputs: Dict[str, Any], device, dtype):
    """Turn a conversation plus an optional audio clip into processor output.

    Args:
        inputs: Dictionary read with the keys ``"turns"`` (list of chat
            messages, defaults to a new empty list), ``"audio"`` (defaults to
            ``None``), ``"prompt"`` (defaults to ``"<|audio|>"``) and
            ``"sampling_rate"`` (defaults to 16000).
        device: Passed to ``.to(...)`` on the processor output.
        dtype: Passed to ``.to(...)`` on the processor output.

    Returns:
        The result of calling the module-level ``processor`` on the rendered
        chat text and the audio, after ``.to(device, dtype)``.

    Note:
        When audio is given and the conversation does not already end with a
        user turn, a user turn is appended to the ``"turns"`` list in place,
        so a list supplied by the caller is modified.
    """
    conversation: list = inputs.get("turns", [])
    waveform = inputs.get("audio", None)

    # Bring NumPy audio to float32; integer samples are divided by the
    # constant associated with their dtype.
    int_divisors = {
        np.dtype(np.int16): 32768.0,
        np.dtype(np.int32): 2147483648.0,
    }
    if isinstance(waveform, np.ndarray):
        if waveform.dtype == np.float64:
            waveform = waveform.astype(np.float32)
        elif waveform.dtype in int_divisors:
            divisor = np.float32(int_divisors[waveform.dtype])
            waveform = waveform.astype(np.float32) / divisor

    # Audio needs a user turn to attach to; add one unless the conversation
    # already ends with a user message.
    if waveform is not None:
        if len(conversation) == 0 or conversation[-1]["role"] != "user":
            user_prompt = inputs.get("prompt", "<|audio|>")
            if "<|audio|>" not in user_prompt:
                print(
                    "Prompt does not contain '<|audio|>', appending '<|audio|>' to the end of the prompt."
                )
                user_prompt += " <|audio|>"
            conversation.append({"role": "user", "content": user_prompt})

    # Render the chat as plain text (not token ids) with a generation prompt.
    chat_text = processor.tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )

    if waveform is not None and "sampling_rate" not in inputs:
        print(
            "No sampling rate provided, using default of 16kHz. We highly recommend providing the correct sampling rate."
        )
    rate = inputs.get("sampling_rate", 16000)

    features = processor(text=chat_text, audio=waveform, sampling_rate=rate)
    if "audio_values" in features:
        features["audio_values"] = features["audio_values"].to(device, dtype)
    return features.to(device, dtype)
|
|
|
|
|
# Start from an empty conversation; preprocess() is given this list together
# with the loaded audio and its sampling rate.
turns = []
model_inputs = preprocess(
    {"audio": audio, "turns": turns, "sampling_rate": sr},
    "cuda",
    torch.bfloat16,
)

# Generate up to 300 new tokens, then decode the squeezed output without
# special tokens and print it.
generated = model.generate(**model_inputs, max_new_tokens=300)
print(processor.tokenizer.decode(generated.squeeze(), skip_special_tokens=True))
|
```