---
license: llama3
language:
- en
library_name: transformers
pipeline_tag: text-generation
tags:
- Text Generation
- Transformers
- llama
- llama-3
- 8B
- nvidia
- facebook
- meta
- LLM
- fine-tuned
- insurance
- research
- pytorch
- instruct
- chatqa-1.5
- chatqa
- finetune
- gpt4
- conversational
- text-generation-inference
- Inference Endpoints
datasets:
- InsuranceQA
base_model: "nvidia/Llama3-ChatQA-1.5-8B"
finetuned: "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B"
quantized: "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B-GGUF"
---
### 1. Fine-tuned from the base model nvidia/Llama3-ChatQA-1.5-8B using LoRA (8-bit)
### 2. Dataset: a subset of InsuranceQA
<br>
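The adapter-training setup itself is not included in this card. For orientation only, below is a minimal sketch of how an 8-bit LoRA fine-tune of `nvidia/Llama3-ChatQA-1.5-8B` is typically configured with `transformers` and `peft`; the rank, alpha, dropout, and target modules shown are illustrative assumptions, not the values actually used for this model.
```python
# Illustrative 8-bit LoRA setup (hyperparameters and target modules are assumptions, not this model's recipe)
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

base_id = "nvidia/Llama3-ChatQA-1.5-8B"
tokenizer = AutoTokenizer.from_pretrained(base_id)

# Load the base model in 8-bit to keep memory requirements low during fine-tuning
model = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

# Attach LoRA adapters to the attention projections
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA adapter weights are trainable
```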
# Python script for local CPU inference with llama-cpp-python
- Windows (prebuilt CPU wheel)
```bash
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
```
- Mac (prebuilt Metal wheel)
```bash
pip uninstall -y llama-cpp-python && pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
```
Or build from source with Metal enabled:
```bash
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
```
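Whichever route you take, a quick import check (the filename `check_install.py` is just illustrative) confirms the wheel is usable before running the inference script below:
```python
# check_install.py -- sanity check that the llama-cpp-python wheel imports correctly
from llama_cpp import Llama

print("llama-cpp-python imported successfully:", Llama.__name__)
```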
```python
import os
import time
import multiprocessing
from pathlib import Path
from textwrap import dedent

from llama_cpp import Llama
from huggingface_hub import hf_hub_download


def download_model():
    """Download the Q5_K_M GGUF file from the Hugging Face Hub into ./gguf_dir."""
    model_name = "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B-GGUF"
    model_file = "openinsurancellm-llama3-8b.Q5_K_M.gguf"
    model_path = hf_hub_download(model_name,
                                 filename=model_file,
                                 local_dir=os.path.join(os.getcwd(), 'gguf_dir'))
    return model_path


def load_model(n_ctx, n_threads, n_batch, n_gpu_layers):
    """Load the GGUF model, reusing a previously downloaded file if one exists."""
    model_dir = Path(os.getcwd()) / "gguf_dir"
    try:
        model_path = str(list(model_dir.glob('openinsurancellm*Q5*.gguf'))[0])
    except IndexError:
        model_path = download_model()
    llm_ctx = Llama(model_path=model_path,
                    n_gpu_layers=n_gpu_layers,  # 0 = CPU-only inference
                    n_ctx=n_ctx,
                    n_threads=n_threads,
                    n_batch=n_batch,
                    verbose=False)
    return llm_ctx


def get_prompt(question):
    system = """You are an expert from the insurance domain with extensive insurance knowledge and a professional writer familiar with all insurance policies.
Your name is OpenInsuranceLLM, and you were developed by Raj Maharajwala to help answer the user's query with an explanation.
In your explanation, leverage your deep insurance expertise, such as relevant insurance policies, complex coverage plans, or other pertinent insurance concepts.
Use precise insurance terminology while still aiming to make the explanation clear and accessible to a general audience."""
    prompt = f"system\n{system}\nuser\nInsurance Question: {question}\nassistant\nInsurance Answer: "
    return prompt


def inference_loop(max_tokens=8025, top_k=15, n_gpu_layers=0, temperature=0.0,
                   n_ctx=8192, n_threads=multiprocessing.cpu_count() - 1, n_batch=512):
    # Load the model
    print("Welcome to OpenInsuranceLLM Inference Loop:\n\n")
    llm_ctx = load_model(n_ctx, n_threads, n_batch, n_gpu_layers)
    print(f"OpenInsuranceLLM Q5_K_M model loaded successfully with n_batch={n_batch}!\n\n"
          "Enter your question (or type 'exit' to quit)\n")
    while True:
        question = input("Raj: ").strip()
        if question.lower() == "exit":
            print("Assistant: Good Bye!")
            break
        prompt = get_prompt(question)
        print("Raj: " + question)
        start_time = time.time()
        response = llm_ctx(prompt, max_tokens=max_tokens, top_k=top_k, temperature=temperature)
        ntokens = response['usage']['completion_tokens']
        ntokens = 1 if ntokens == 0 else ntokens  # guard against division by zero
        answer = dedent(response['choices'][0]['text'])
        execution_time = time.time() - start_time
        print(f"Assistant: {answer}")
        print(f"tokens: {ntokens}")
        print(f"Time: {execution_time:.2f} s  Per token: {execution_time / ntokens:.2f} s  "
              f"Tokens/sec: {round(ntokens / execution_time)} tokens/s\n\n\n")


# Default params: inference_loop(max_tokens=8025, top_k=15, n_gpu_layers=0, temperature=0.0, n_ctx=8192, n_threads=cpu_count() - 1, n_batch=512)
inference_loop(top_k=10)
```
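If you would rather run the full-precision fine-tuned checkpoint (`Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B`) with `transformers` instead of the GGUF quantization, a minimal sketch follows; the shortened system prompt and generation settings here are illustrative assumptions, not tuned recommendations.
```python
# Minimal transformers sketch for the non-quantized checkpoint (prompt wording and settings are illustrative)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

question = "What does a standard homeowners policy typically cover?"
prompt = (
    "system\nYou are OpenInsuranceLLM, an insurance domain expert.\n"
    f"user\nInsurance Question: {question}\nassistant\nInsurance Answer: "
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)
# Decode only the newly generated tokens, skipping the prompt
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```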
arXiv: [https://arxiv.org/pdf/2401.10225](https://arxiv.org/pdf/2401.10225)