Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,067 Bytes
abed9dd ae274fc 249f47f 31a98e0 249f47f 3f30de8 249f47f ae274fc 4adfe65 249f47f 4adfe65 249f47f ae274fc 249f47f ae274fc 249f47f abed9dd 2c6942f 249f47f abed9dd 249f47f 1298db3 249f47f abed9dd 249f47f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
# When True, generate_text() reloads the model and rebuilds the pipeline on
# every call (the ZeroGPU path) and init() skips its device report.
is_hugging_face = False

# Shared text-generation pipeline; stays None until init() (or the ZeroGPU
# path of generate_text()) assigns it.
text_generator = None
def init():
    """Load the tokenizer and model and build the shared generation pipeline.

    Reads the access token from the ``HUGGINGFACE_TOKEN`` environment
    variable, loads ``meta-llama/Meta-Llama-3.1-8B-Instruct`` in bfloat16
    with ``device_map="auto"``, and stores the resulting text-generation
    pipeline in the module-level ``text_generator``.

    The loading settings (``model_id``, ``huggingface_token``, ``dtype``,
    ``device``, ``tokenizer``) are published as module globals as well,
    because ``generate_text`` reads them from module scope when it rebuilds
    the pipeline on the ZeroGPU path.

    Returns:
        None. Results are exposed through module globals; progress is
        reported with ``print``.
    """
    global text_generator
    # Needed by generate_text() when is_hugging_face is True; as plain locals
    # they would be invisible there and that branch would raise NameError.
    global model_id, huggingface_token, dtype, device, tokenizer

    huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
    if not huggingface_token:
        # Deliberately non-fatal: loading is still attempted without a token.
        print("no HUGGINGFACE_TOKEN if you need set secret ")

    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    device = "auto"
    dtype = torch.bfloat16

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)
    print(model_id, device, dtype)

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=huggingface_token,
        torch_dtype=dtype,
        device_map=device,
    )
    # The pipeline object has no .to(device); placement is requested through
    # device_map instead.
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=dtype,
        device_map=device,
    )

    if not is_hugging_face:
        if next(model.parameters()).is_cuda:
            print("The model is on a GPU")
        else:
            print("The model is on a CPU")

        # A CUDA device usually stringifies with an index ("cuda:0"), so an
        # equality test against "cuda" would misreport a GPU pipeline as CPU.
        if str(text_generator.device).startswith("cuda"):
            print("The pipeline is using a GPU")
        else:
            print("The pipeline is using a CPU")

    print("initialized")
@spaces.GPU(duration=120)
def generate_text(messages):
    """Run the chat pipeline on ``messages`` and return the assistant reply.

    Args:
        messages: Conversation passed straight to the text-generation
            pipeline (built by ``call_generate_text`` as a list of
            ``{"role": ..., "content": ...}`` dicts).

    Returns:
        The ``content`` of the last message whose role is ``"assistant"``
        in the generated conversation, or a fixed fallback string when no
        such message exists or the output is not a list.
    """
    global text_generator

    if is_hugging_face:
        # ZeroGPU needs the model and pipeline rebuilt on every call.
        # This branch reads model_id, huggingface_token, dtype, device and
        # tokenizer from module scope, so they must be defined there.
        fresh_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=huggingface_token,
            torch_dtype=dtype,
            device_map=device,
        )
        text_generator = pipeline(
            "text-generation",
            model=fresh_model,
            tokenizer=tokenizer,
            torch_dtype=dtype,
            device_map=device,
        )

    outputs = text_generator(
        messages,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
    )
    conversation = outputs[0]["generated_text"]

    if not isinstance(conversation, list):
        return "Unexpected output format."

    # Walk backwards so the most recent assistant turn wins.
    for turn in reversed(conversation):
        if turn.get("role") == "assistant":
            return turn.get("content", "No content found.")
    return "No assistant response found."
def call_generate_text(message, history):
    """Gradio chat callback: append the new user turn and generate a reply.

    Args:
        message: Text of the latest user message.
        history: Earlier conversation turns; concatenated with the new
            user turn to form the model input.

    Returns:
        The text returned by ``generate_text``, or an empty string when it
        raises ``RuntimeError``.
    """
    print(message)
    print(history)

    # Build a new list rather than appending to the caller's history.
    conversation = history + [{"role": "user", "content": message}]

    try:
        return generate_text(conversation)
    except RuntimeError as err:
        print(f"An unexpected error occurred: {err}")
    return ""
# JavaScript snippet passed to the chat interface through its `js` argument.
js = """
console.log('hello js')
"""

# Chat UI that routes each user message through call_generate_text, using the
# "messages" (role/content dict) history format.
demo = gr.ChatInterface(call_generate_text, type="messages", js=js)

if __name__ == "__main__":
    # Load the model and build the pipeline before the UI starts serving.
    init()
    demo.launch(share=True)