import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
text_generator = None
is_hugging_face = False
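
# Set is_hugging_face to True when running as a ZeroGPU Space: the model then has to be
# (re)loaded inside the @spaces.GPU-decorated function on every request (see generate_text below).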
def init():
    global text_generator, tokenizer, model_id, device, dtype, huggingface_token
    huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
    if not huggingface_token:
        print("HUGGINGFACE_TOKEN is not set; add it as a Space secret if the model requires authentication")
        #raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

    #model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # gated alternative, requires a token
    model_id = "google/gemma-2b"

    device = "auto"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #device = "cuda"
    dtype = torch.bfloat16
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)

    print(model_id, device, dtype)
    histories = []  # (unused)
    #model = None

    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
    )

    text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device)  # pipelines have no .to(device); placement comes from device_map

    if not is_hugging_face:
        if next(model.parameters()).is_cuda:
            print("The model is on a GPU")
        else:
            print("The model is on a CPU")

        #print(f"text_generator.device='{text_generator.device}'")
        if str(text_generator.device).startswith("cuda"):
            print("The pipeline is using a GPU")
        else:
            print("The pipeline is using a CPU")

    print("initialized")
@spaces.GPU(duration=120)
def generate_text(messages):
    global text_generator
    if is_hugging_face:  # on ZeroGPU the model must be re-created inside the GPU-decorated call
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
        text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device)

    result = text_generator(messages, max_new_tokens=256, do_sample=True, temperature=0.7)

    generated_output = result[0]["generated_text"]
    if isinstance(generated_output, list):
        # chat-style pipelines return the whole conversation; pick the last assistant turn
        for message in reversed(generated_output):
            if message.get("role") == "assistant":
                content = message.get("content", "No content found.")
                return content
        return "No assistant response found."
    else:
        return "Unexpected output format."
def call_generate_text(message, history):
    # history.append({"role": "user", "content": message})
    print(message)
    print(history)

    messages = history + [{"role": "user", "content": message}]
    try:
        text = generate_text(messages)
        return text
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")
        return ""
js = """
function(chatbot){
    console.log(chatbot)
}
"""
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")  # messages format matches the role/content dicts built above
    chatbot.change(None, [chatbot], [], js=js)
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    #demo = gr.ChatInterface(call_generate_text, chatbot=chatbot, type="messages")
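    # `respond` is wired up by msg.submit below but is not defined in this file; a minimal
    # sketch (assumed implementation): clear the textbox and append the user/assistant
    # turn to the chat history in messages format.
    def respond(message, chat_history):
        bot_message = call_generate_text(message, chat_history)
        chat_history = chat_history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": bot_message},
        ]
        return "", chat_history
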
    msg.submit(respond, [msg, chatbot], [msg, chatbot])
if __name__ == "__main__":
    init()
    demo.launch(share=True)