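# app.py -- Gradio chat demo for cognitivecomputations/dolphin-2.5-mixtral-8x7b,
# loaded in 4-bit on a Hugging Face ZeroGPU Space.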
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from spaces import GPU  # ZeroGPU: allocates a GPU only for the duration of each decorated call
# Quantize the model to 4-bit NF4 so Mixtral 8x7B fits in GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
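# Load the tokenizer and the quantized model weights (cached after the first download)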
model_id = "cognitivecomputations/dolphin-2.5-mixtral-8x7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
@GPU(duration=70)  # request a GPU slot for up to 70 seconds per call
def predict(input_text, history):
    # Rebuild the conversation from Gradio's (user, assistant) history pairs
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": input_text})
    # Apply the model's chat template and append the assistant generation prompt
    conv = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(conv, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=2048)
    # Decode only the newly generated tokens so the prompt is not echoed back in the reply
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
# Build the chat UI around predict() and launch the app
gr.ChatInterface(predict, theme="soft").launch()