import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
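# Overview: a minimal Gradio chat demo that wraps a transformers text-generation
# pipeline, with optional ZeroGPU support via the `spaces` decorator.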

text_generator = None
tokenizer = None
huggingface_token = None
model_id = None
device = None
dtype = None
is_hugging_face = False  # set to True when running on a Hugging Face ZeroGPU Space

def init():
    global text_generator, tokenizer, huggingface_token, model_id, device, dtype
    huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
    if not huggingface_token:
        print("HUGGINGFACE_TOKEN is not set; add it as a secret if the model requires authentication")
        #raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

    #model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    model_id = "google/gemma-2b"

    device = "auto"  # let device_map/accelerate place the model; alternatively "cuda" or "cpu"
    dtype = torch.bfloat16

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)

    print(model_id, device, dtype)

    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
    )
    # the pipeline object has no .to(device); placement is handled by device_map
    text_generator = pipeline(
        "text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device
    )

    if not is_hugging_face:
        if next(model.parameters()).is_cuda:
            print("The model is on a GPU")
        else:
            print("The model is on a CPU")

        if str(text_generator.device).startswith("cuda"):
            print("The pipeline is using a GPU")
        else:
            print("The pipeline is using a CPU")

    print("initialized")

# On a ZeroGPU Space, @spaces.GPU requests a GPU for at most `duration` seconds
# per call; outside of ZeroGPU the decorator is effectively a no-op.
@spaces.GPU(duration=120)
def generate_text(messages):
    global text_generator
    if is_hugging_face:  # on ZeroGPU, re-initialize the pipeline on every call
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
        text_generator = pipeline(
            "text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device
        )
    result = text_generator(messages, max_new_tokens=256, do_sample=True, temperature=0.7)

    generated_output = result[0]["generated_text"]
    if isinstance(generated_output, list):
        # chat-style output: walk backwards to find the last assistant turn
        for message in reversed(generated_output):
            if message.get("role") == "assistant":
                return message.get("content", "No content found.")
        return "No assistant response found."
    else:
        return "Unexpected output format."



def call_generate_text(message, history):
    # history arrives in messages format: a list of {"role": ..., "content": ...} dicts
    print(message)
    print(history)

    messages = history + [{"role": "user", "content": message}]
    try:
        return generate_text(messages)
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")
    return ""

js = """
function(chatbot){
console.log(chatbot)
}
"""
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    chatbot.change(None, [chatbot], [], js=js)
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        # generate a reply, append both turns to the history, and clear the textbox
        response = call_generate_text(message, chat_history)
        chat_history = chat_history + [{"role": "user", "content": message},
                                       {"role": "assistant", "content": response}]
        return "", chat_history

    #demo = gr.ChatInterface(call_generate_text, chatbot=chatbot, type="messages")
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    init()
    demo.launch(share=True)
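# Local run (a sketch, assuming the usual dependencies): pip install torch transformers
# accelerate gradio spaces, optionally export HUGGINGFACE_TOKEN, then run this file.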