import gradio as gr from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer from threading import Thread model = AutoModelForCausalLM.from_pretrained("Sigurdur/icechat") tokenizer = AutoTokenizer.from_pretrained("Sigurdur/icechat") def streaming_respond(question, history): input_ids = tokenizer.encode(f"### Question:\n{question}\n\n### Answer:\n", return_tensors="pt") streamer = TextIteratorStreamer( tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True ) generate_kwargs = dict( {"input_ids": input_ids}, streamer=streamer, max_new_tokens=100, temperature=0.7, num_beams=1, ) t = Thread(target=model.generate, kwargs=generate_kwargs) t.start() outputs = [] for text in streamer: outputs.append(text) yield "".join(outputs) gr.ChatInterface(streaming_respond).launch()