HuggingFace Newbie - What input does this model expect?

#1
by joe-muller - opened

I am fairly new to HuggingFace and deploying models for inference. I am using beam.cloud to deploy this model but I'm not sure how to actually use it. When I send a list of messages, the response contains an output field with what looks like gibberish text completion.

What type of input does this model expect? Is there somewhere on HuggingFace to see that?

How is this model intended to be used? Should I be constantly sending partial transcripts until it tells me the turn is over?

Thanks!


Here's a quick example of trying to use the model with the transformers library. What task should I use? The example uses text-generation, but that doesn't give me understandable results:

from transformers import pipeline, Pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
    {"role": "assistant", "content": "I am me."},
    {"role": "user", "content": "But who are you really?"},
    {"role": "assistant", "content": "I am me."},
    {"role": "user", "content": "But who does"}
]
pipe: Pipeline = pipeline("text-generation", model="livekit/turn-detector")
result = pipe(messages)
print(result)

Outputs:

[{'generated_text': [{'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': 'I am me.'}, {'role': 'user', 'content': 'But who are you really?'}, {'role': 'assistant', 'content': 'I am me.'}, {'role': 'user', 'content': 'But who does'}, {'role': 'assistant', 'content': 'youwhatwhatwhatwhat'}]}]

This seems to work better:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("livekit/turn-detector")

messages = [
    {"role": "user", "content": "Who are you?"},
    {"role": "assistant", "content": "I am John."},
    {"role": "user", "content": "What is your last name?"},
    {"role": "assistant", "content": "Smith."},
    {"role": "user", "content": "How do you spell the first"}
]

# Format messages using the chat template
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=False,
    add_special_tokens=False,
    tokenize=False
)

# Strip the trailing <|im_end|> that the chat template adds after the last
# message, so the final utterance looks unfinished to the model
ix = text.rfind("<|im_end|>")
text = text[:ix] if ix != -1 else text

# Tokenize
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "livekit/turn-detector")

# Get prediction
with torch.no_grad():
    outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    print("probabilities", probabilities)
    # Use index 1 for the positive class probability
    eou_probability = probabilities[0, 1].item()

print(f"End of utterance probability: {eou_probability}")

It outputs something like this:

probabilities tensor([[0.9695, 0.0305]])
End of utterance probability: 0.030476752668619156

I imagine the first value in the tensor is the probability the speech will continue and the second value is the probability the speech is finished.

However, I found that with this code the probability for the same input varies from run to run, sometimes anywhere from 0 to 1.

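For what it's worth, the variation is easy to reproduce. Below is a minimal sketch (my own, using just transformers and torch with a toy input string instead of the full chat template) that loads the checkpoint twice and scores the same text with each copy; the two results disagree, which suggests the randomness comes from loading the model rather than from inference:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Repro sketch: score the same text with two separately loaded copies of the
# model. The probabilities differ between the copies even though inference
# itself is deterministic, which points at the loading step.
tokenizer = AutoTokenizer.from_pretrained("livekit/turn-detector")
inputs = tokenizer("How do you spell the first", return_tensors="pt")

for attempt in range(2):
    model = AutoModelForSequenceClassification.from_pretrained("livekit/turn-detector")
    with torch.no_grad():
        probs = torch.softmax(model(**inputs).logits, dim=-1)
    print(f"load {attempt}: {probs}")

Within a single loaded model the numbers are stable; it is only across loads (or script runs) that they jump around.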

model = AutoModelForSequenceClassification.from_pretrained(
    "livekit/turn-detector")

This loads the checkpoint into LlamaForSequenceClassification, which attaches a brand-new, randomly initialized classification head (score.weight) on top of the base model. That head was never trained, so the probabilities it produces are meaningless and change every time the model is reloaded, which is exactly the run-to-run variation described above. transformers even warns about it when loading:

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at livekit/turn-detector and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
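You can confirm the head isn't part of the checkpoint by listing the tensors the repo actually stores. A quick sketch, assuming the weights are published as model.safetensors (that filename is my guess, not something stated in this thread):

from huggingface_hub import hf_hub_download
from safetensors import safe_open

# Assumption: the transformer weights live in model.safetensors at the repo root.
# Print any stored tensor whose name mentions "score" (the head transformers
# warned about). An empty list means the head is not in the checkpoint.
path = hf_hub_download("livekit/turn-detector", "model.safetensors")
with safe_open(path, framework="pt") as f:
    print([name for name in f.keys() if "score" in name])  # expected: []

A more reliable route is the ONNX export that ships in the repo, which the script below downloads and runs directly: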
import onnxruntime as ort
import numpy as np
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download, errors

# Constants
HG_MODEL = "livekit/turn-detector"
ONNX_FILENAME = "model_q8.onnx"
MODEL_REVISION = "v1.2.0"
MAX_HISTORY = 4
MAX_HISTORY_TOKENS = 512

# Ensure model file exists or download it
try:
    model_path = hf_hub_download(
        repo_id=HG_MODEL,
        filename=ONNX_FILENAME,
        subfolder="onnx",
        revision=MODEL_REVISION,
        local_files_only=False,  # Set to True if you've already downloaded it
    )
except errors.LocalEntryNotFoundError:
    raise RuntimeError(f"Could not find {ONNX_FILENAME}. Make sure the model is available on Hugging Face.")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    HG_MODEL,
    revision=MODEL_REVISION,
    truncation_side="left",
)

# Load ONNX model
session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])

# Chat context
chat_ctx = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing well, thank you! How can I assist you today?"},
    {"role": "user", "content": "I need help with my project."}
]

# Keep only the most recent MAX_HISTORY messages
chat_ctx = chat_ctx[-MAX_HISTORY:]

def format_chat_ctx(chat_ctx: list[dict]) -> str:
    new_chat_ctx = []
    for msg in chat_ctx:
        content = msg["content"]
        if not content:
            continue
        new_chat_ctx.append(msg)

    convo_text = tokenizer.apply_chat_template(
        new_chat_ctx,
        add_generation_prompt=False,
        add_special_tokens=False,
        tokenize=False,
    )

    # Remove the EOU token from current utterance
    ix = convo_text.rfind("<|im_end|>")
    text = convo_text[:ix] if ix != -1 else convo_text
    return text

# Format chat context
text = format_chat_ctx(chat_ctx)

# Tokenize input
inputs = tokenizer(
    text,
    add_special_tokens=False,
    return_tensors="np",  # ONNX requires NumPy format
    max_length=MAX_HISTORY_TOKENS,
    truncation=True
)

# Run inference
outputs = session.run(None, {"input_ids": inputs["input_ids"]})
eou_probability = outputs[0][0]  # Extract probability

# Output result
print(f"End-of-Utterance Probability: {eou_probability}")
