MegaTronX committed
Commit 578b041 · verified · 1 Parent(s): cce1340

Update app.py

Files changed (1): app.py +61 -3
app.py CHANGED
@@ -18,9 +18,67 @@ hf_hub_download(
 
 
  @spaces.GPU(duration=120) #Is this setting the timeout?
- def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
-     # Simplified function logic for testing
-     return f"Response: {message}"
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     model,
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+     top_k,
+     repeat_penalty,
+ ):
+     chat_template = MessagesFormatterType.GEMMA_2
+
+     llm = Llama(
+         model_path=f"models/{model}",
+         flash_attn=True,
+         n_gpu_layers=81,
+         n_batch=1024,
+         n_ctx=8192,
+     )
+
+     provider = LlamaCppPythonProvider(llm)
+
+     # Configure sampling settings
+     settings = provider.get_provider_default_settings()
+     settings.temperature = temperature
+     settings.top_k = top_k
+     settings.top_p = top_p
+     settings.max_tokens = max_tokens
+     settings.repeat_penalty = repeat_penalty
+     settings.stream = True
+
+     # Prepare chat history
+     messages = BasicChatHistory()
+
+     for msn in history:
+         user_message = {'role': Roles.user, 'content': msn[0]}
+         assistant_message = {'role': Roles.assistant, 'content': msn[1]}
+         messages.add_message(user_message)
+         messages.add_message(assistant_message)
+
+     # Generate response using Llama model
+     try:
+         stream = llm(
+             prompt=message,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             stop=["\n"], # Adjust stop conditions as needed
+             echo=False # Ensure only generated text is returned
+         )
+
+         outputs = ""
+         for output in stream:
+             outputs += output["choices"][0]["text"] # Extract text from response
+             yield outputs.strip()
+
+     except Exception as e:
+         yield f"Error: {str(e)}"
+
 
  def create_interface(model_name):
      return gr.ChatInterface(
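
For reference on the inline question: on ZeroGPU Spaces, the duration argument of @spaces.GPU sets the GPU time budget for each decorated call (calls running longer are aborted), so duration=120 does effectively act as a per-call timeout.

The new respond builds a LlamaCppPythonProvider, sampling settings, and a BasicChatHistory, then streams completions from llm(...) directly. If the intent is to route generation through those objects (so that system_message, the GEMMA_2 chat template, the settings, and the history all take effect), a minimal sketch using llama-cpp-agent's LlamaCppAgent is shown below. The function name respond_with_agent is illustrative only, and the LlamaCppAgent / get_chat_response calls follow that library's published example pattern, not anything in this commit.

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles


def respond_with_agent(message, history, model, system_message,
                       max_tokens, temperature, top_p, top_k, repeat_penalty):
    # Illustrative sketch, not part of this commit.
    llm = Llama(model_path=f"models/{model}", flash_attn=True,
                n_gpu_layers=81, n_batch=1024, n_ctx=8192)
    provider = LlamaCppPythonProvider(llm)

    # Same sampling settings as in the committed respond().
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Same chat-history construction as in the committed respond().
    messages = BasicChatHistory()
    for user_turn, assistant_turn in history:
        messages.add_message({"role": Roles.user, "content": user_turn})
        messages.add_message({"role": Roles.assistant, "content": assistant_turn})

    # The agent applies the system prompt and the GEMMA_2 formatter, then
    # streams tokens through the provider using the settings above.
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
        debug_output=False,
    )
    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    partial = ""
    for token in stream:
        partial += token
        yield partial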
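
The trailing context shows only the first lines of create_interface, so the actual gr.ChatInterface arguments are outside this hunk. Purely as a hypothetical illustration of how a streaming generator like respond is usually hooked up in Gradio, with the extra parameters exposed as additional_inputs (every widget label and default below is assumed, not taken from the repository):

import gradio as gr


def create_interface_sketch(model_name):
    # Hypothetical wiring; the real create_interface arguments are not shown in this hunk.
    def respond_fn(message, history, system_message, max_tokens,
                   temperature, top_p, top_k, repeat_penalty):
        # Pin `model` to this interface's model file and forward everything else
        # to the committed respond() generator, re-yielding its partial outputs.
        yield from respond(message, history, model_name, system_message,
                           max_tokens, temperature, top_p, top_k, repeat_penalty)

    return gr.ChatInterface(
        respond_fn,
        additional_inputs=[
            gr.Textbox(value="You are a helpful assistant.", label="System message"),
            gr.Slider(minimum=1, maximum=8192, value=2048, step=1, label="Max tokens"),
            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
            gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
            gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.05, label="Repetition penalty"),
        ],
        title=model_name,
    )

Because respond yields progressively longer strings, gr.ChatInterface streams each yield to the chat window as it arrives.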