---
license: llama3
language:
- en
library_name: transformers
pipeline_tag: text-generation
tags:
- Text Generation
- Transformers
- llama
- llama-3
- 8B
- nvidia
- facebook
- meta
- LLM
- fine-tuned
- insurance
- research
- pytorch
- instruct
- chatqa-1.5
- chatqa
- finetune
- gpt4
- conversational
- text-generation-inference
- Inference Endpoints
datasets:
- InsuranceQA

base_model: "nvidia/Llama3-ChatQA-1.5-8B"
finetuned: "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B"
quantized: "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B-GGUF"
---


### 1. Fine-tuned from the base model nvidia/Llama3-ChatQA-1.5-8B using LoRA (8-bit)

### 2. Dataset: Subset of InsuranceQA
<br>
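The full-precision fine-tuned weights (`Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B`, listed in the metadata above) should also load directly with `transformers`; a minimal sketch, assuming the repo follows the standard causal-LM checkpoint layout:

```python
# Minimal sketch: load the non-quantized fine-tuned model with transformers
# (assumes a standard causal-LM checkpoint; the GGUF repo below is for llama.cpp)
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
```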

# Python script for local CPU inference using llama-cpp-python

- Windows (prebuilt CPU wheels)
```bash
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
```

- Mac (prebuilt Metal wheels)
```bash
pip uninstall -y llama-cpp-python && pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
```

- Mac (alternatively, build from source with Metal enabled)
```bash
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
```
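
Either way, a quick sanity check that the binding installed correctly before running the full script below:

```python
# Sanity check: the import succeeds and reports the installed version
import llama_cpp
print(llama_cpp.__version__)
```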

```python
import os
import time
import multiprocessing
from pathlib import Path
from textwrap import dedent

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

def download_model():
    # Download the Q5_K_M GGUF file from the Hugging Face Hub into ./gguf_dir
    model_name = "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B-GGUF"
    model_file = "openinsurancellm-llama3-8b.Q5_K_M.gguf"
    model_path = hf_hub_download(model_name,
                                 filename=model_file,
                                 local_dir=os.path.join(os.getcwd(), "gguf_dir"))
    return model_path

def load_model(n_ctx, n_threads, n_batch, n_gpu_layers):
    # Reuse a locally cached GGUF if present; otherwise download it first
    model_dir = Path(os.getcwd()) / "gguf_dir"
    try:
        model_path = str(list(model_dir.glob('openinsurancellm*Q5*.gguf'))[0])
    except IndexError:  # no cached file matched the pattern
        model_path = download_model()

    llm_ctx = Llama(model_path=model_path,
                    n_gpu_layers=n_gpu_layers,  # 0 = CPU-only inference
                    n_ctx=n_ctx,
                    n_threads=n_threads,
                    n_batch=n_batch,
                    verbose=False)
    return llm_ctx


def get_prompt(question):
    system = """You are an experienced expert in the insurance domain with extensive knowledge of insurance policies and a professional writing style.
    Your name is OpenInsuranceLLM, and you were developed by Raj Maharajwala to help answer the user's query with an explanation.
    In your explanation, leverage your deep insurance expertise, such as relevant insurance policies, complex coverage plans, or other pertinent insurance concepts.
    Use precise insurance terminology while still keeping the explanation clear and accessible to a general audience."""

    prompt = f"system\n{system}\nuser\nInsurance Question: {question}\nassistant\nInsurance Answer: "
    return prompt


def inference_loop(max_tokens=8025, top_k=15, n_gpu_layers=0, temperature=0.0,
                   n_ctx=8192, n_threads=multiprocessing.cpu_count() - 1, n_batch=512):
    print("Welcome to the OpenInsuranceLLM inference loop:\n\n")

    # Load the model once, then answer questions interactively
    llm_ctx = load_model(n_ctx, n_threads, n_batch, n_gpu_layers)
    print(f"OpenInsuranceLLM Q5_K_M model loaded successfully with n_batch={n_batch}!\n\n"
          "Enter your question (or type 'exit' to quit)\n")

    while True:
        question = input("User: ").strip()
        if question.lower() == "exit":
            print("Assistant: Goodbye!")
            break

        prompt = get_prompt(question)
        start_time = time.time()
        response = llm_ctx(prompt, max_tokens=max_tokens, top_k=top_k, temperature=temperature)
        execution_time = time.time() - start_time
        ntokens = max(response['usage']['completion_tokens'], 1)  # guard against division by zero
        answer = dedent(response['choices'][0]['text'])
        print(f"Assistant: {answer}")
        print(f"tokens: {ntokens}")
        print(f"Time: {execution_time:.2f} s  Per token: {execution_time / ntokens:.2f} s  "
              f"Tokens/sec: {round(ntokens / execution_time)}\n\n\n")

# Default params: inference_loop(max_tokens=8025, top_k=15, n_gpu_layers=0, temperature=0.0,
#                                n_ctx=8192, n_threads=multiprocessing.cpu_count() - 1, n_batch=512)
inference_loop(top_k=10)
```
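
If you installed the Metal (or another GPU-enabled) build, layers can be offloaded to the GPU via `n_gpu_layers`; a small usage variation:

```python
# n_gpu_layers=-1 asks llama.cpp to offload all layers to the GPU
inference_loop(n_gpu_layers=-1, top_k=10)
```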

arXiv (ChatQA paper): [https://arxiv.org/pdf/2401.10225](https://arxiv.org/pdf/2401.10225)