Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -18,9 +18,68 @@ hf_hub_download(
 
 
 @spaces.GPU(duration=120) #Is this setting the timeout?
-def respond(
-
-
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    model,
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+    top_k,
+    repeat_penalty,
+):
+    chat_template = MessagesFormatterType.GEMMA_2
+
+    llm = Llama(
+        model_path=f"models/{model}",
+        flash_attn=True,
+        n_gpu_layers=81,
+        n_batch=1024,
+        n_ctx=8192,
+    )
+
+    provider = LlamaCppPythonProvider(llm)
+
+    # Configure sampling settings (prepared here, but not passed to llm() below)
+    settings = provider.get_provider_default_settings()
+    settings.temperature = temperature
+    settings.top_k = top_k
+    settings.top_p = top_p
+    settings.max_tokens = max_tokens
+    settings.repeat_penalty = repeat_penalty
+    settings.stream = True
+
+    # Rebuild the Gradio chat history in llama-cpp-agent's format
+    messages = BasicChatHistory()
+
+    for msn in history:
+        user_message = {'role': Roles.user, 'content': msn[0]}
+        assistant_message = {'role': Roles.assistant, 'content': msn[1]}
+        messages.add_message(user_message)
+        messages.add_message(assistant_message)
+
+    # Generate the response by streaming completions from the Llama model
+    try:
+        stream = llm(
+            prompt=message,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            stop=["\n"],  # Adjust stop conditions as needed
+            stream=True,  # Stream chunks so the loop below can iterate over them
+            echo=False,  # Ensure only generated text is returned
+        )
+
+        outputs = ""
+        for output in stream:
+            outputs += output["choices"][0]["text"]  # Extract text from each chunk
+            yield outputs.strip()
+
+    except Exception as e:
+        yield f"Error: {str(e)}"
+
 
 def create_interface(model_name):
     return gr.ChatInterface(
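A note on the `#Is this setting the timeout?` question: on ZeroGPU Spaces, the `duration` argument of `@spaces.GPU()` caps how many seconds a single decorated call may hold the GPU before the allocation is reclaimed, so `duration=120` gives each `respond()` call up to 120 seconds.

Also worth flagging: `provider`, `settings`, `messages`, and `chat_template` are constructed but never used. The raw `llm()` call ignores them, so the configured sampling settings and the rebuilt chat history have no effect, and `stop=["\n"]` ends every reply at the first newline. Below is a minimal sketch of how these objects are typically wired together through llama-cpp-agent instead; the `LlamaCppAgent` constructor and `get_chat_response()` keywords here are assumptions based on that library's common usage, not part of this commit:

from llama_cpp_agent import LlamaCppAgent

# Hypothetical replacement for the try/except block inside respond(), reusing
# the provider, settings, messages, chat_template, and system_message above.
agent = LlamaCppAgent(
    provider,
    system_prompt=system_message,
    predefined_messages_formatter_type=chat_template,  # GEMMA_2 prompt format
    debug_output=False,
)

stream = agent.get_chat_response(
    message,
    llm_sampling_settings=settings,    # temperature, top_p, top_k, etc.
    chat_history=messages,             # the rebuilt conversation history
    returns_streaming_generator=True,  # yield text chunks incrementally
    print_output=False,
)

outputs = ""
for text_chunk in stream:
    outputs += text_chunk
    yield outputs

With the formatter handling Gemma's chat template, manual stop strings such as "\n" are generally unnecessary.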