MegaTronX committed on
Commit
1f3c9c5
·
verified ·
1 Parent(s): b0e034d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import json
3
+ import subprocess
4
+ from llama_cpp import Llama
5
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
6
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
7
+ from llama_cpp_agent.chat_history import BasicChatHistory
8
+ from llama_cpp_agent.chat_history.messages import Roles
9
+ import gradio as gr
10
+ from huggingface_hub import hf_hub_download
11
+
12
+
13
# Fetch the quantized GGUF checkpoint once at startup so llama.cpp can
# load it from the local ./models directory.
MODEL_REPO = "tHottie/NeuralDaredevil-8B-abliterated-Q4_K_M-GGUF"
MODEL_FILE = "neuraldaredevil-8b-abliterated-q4_k_m-imat.gguf"

hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir="./models")
18
+
19
+
20
@spaces.GPU(duration=120)  # ZeroGPU: allocates a GPU for up to 120 s per call
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Stream a chat completion from a local GGUF model via llama-cpp-agent.

    Parameters mirror the Gradio ChatInterface additional inputs:
        message: the new user message.
        history: prior (user, assistant) turns as Gradio tuples.
        model: GGUF filename under ./models to load.
        system_message: system prompt passed to the agent.
        max_tokens / temperature / top_p / top_k / repeat_penalty:
            sampling settings forwarded to the provider.

    Yields:
        The accumulated response text so Gradio renders a growing message.
    """
    # BUG FIX: the served model (NeuralDaredevil-8B-abliterated) is a
    # Llama-3 fine-tune, so it needs the Llama-3 prompt format. The previous
    # GEMMA_2 template produced malformed prompts for this model.
    chat_template = MessagesFormatterType.LLAMA_3

    # NOTE(review): the model is reloaded on every request; acceptable under
    # ZeroGPU's per-call allocation, but a module-level cache keyed by `model`
    # would cut latency if the runtime permits it — confirm before changing.
    llm = Llama(
        model_path=f"models/{model}",
        flash_attn=True,
        n_gpu_layers=81,   # offload effectively all layers to the GPU
        n_batch=1024,
        n_ctx=8192,
    )
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Rebuild the agent-side chat history from Gradio's (user, assistant) tuples.
    messages = BasicChatHistory()
    for user_text, assistant_text in history:
        messages.add_message({'role': Roles.user, 'content': user_text})
        # Robustness: an in-flight turn may have no assistant reply yet.
        if assistant_text is not None:
            messages.add_message({'role': Roles.assistant, 'content': assistant_text})

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Yield cumulative text so the UI shows a progressively growing reply.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs
84
+
85
+
86
def create_interface(model_name, description):
    """Build a Gradio ChatInterface wired to ``respond`` for one model.

    Args:
        model_name: GGUF filename under ./models; also shown as the page title.
        description: HTML/markdown blurb rendered under the title.

    Returns:
        A configured ``gr.ChatInterface``.
    """
    return gr.ChatInterface(
        respond,
        additional_inputs=[
            # Model is fixed per interface; shown read-only for transparency.
            gr.Textbox(value=model_name, label="Model", interactive=False),
            # FIX: this textbox feeds respond()'s system_message but had no
            # label, leaving an anonymous empty field in the UI.
            gr.Textbox(value="", label="System message"),
            gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
            gr.Slider(minimum=0.1, maximum=4.0, value=0.3, step=0.1, label="Temperature"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.90, step=0.05, label="Top-p"),
            gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
            gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty"),
        ],
        retry_btn="Retry",
        undo_btn="Undo",
        clear_btn="Clear",
        submit_btn="Send",
        title=f"{model_name}",
        description=description,
        chatbot=gr.Chatbot(
            scale=1,
            likeable=False,
            show_copy_button=True,
        ),
    )
129
+
130
# FIX: the opening <p> tag was missing its closing '>', producing broken HTML.
description = """<p align="center">NeuralDareDevil_8B_Abliterated_Q4_GGUF</p>"""
interface = create_interface('neuraldaredevil-8b-abliterated-q4_k_m-imat.gguf', description)

demo = gr.Blocks()

with demo:
    interface.render()

# Launch only when executed as a script (HF Spaces also imports this module).
if __name__ == "__main__":
    demo.launch()