---
license: llama3
language:
- en
library_name: transformers
pipeline_tag: text-generation
tags:
- Text Generation
- Transformers
- llama
- llama-3
- 8B
- nvidia
- facebook
- meta
- LLM
- fine-tuned
- insurance
- research
- pytorch
- instruct
- chatqa-1.5
- chatqa
- finetune
- gpt4
- conversational
- text-generation-inference
- Inference Endpoints
datasets:
- InsuranceQA
base_model: "nvidia/Llama3-ChatQA-1.5-8B"
finetuned: "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B"
quantized: "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B-GGUF"
---

### 1. Fine-tuned on base model nvidia/Llama3-ChatQA-1.5-8B using LoRA (8-bit)
### 2. Dataset: subset of InsuranceQA
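The full-precision fine-tuned checkpoint (`Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B`, listed as `finetuned` in the metadata above) can also be loaded directly with `transformers`. The snippet below is a minimal sketch rather than an official example: the repo id comes from the metadata above, while the dtype/device settings (which assume `accelerate` is installed) and the shortened system prompt are illustrative assumptions; the prompt template mirrors the GGUF inference script further down.

```python
# Minimal sketch (assumptions noted above): load the full-precision checkpoint with transformers.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,   # assumed dtype; use bfloat16/float32 depending on hardware
    device_map="auto",           # requires accelerate
)

question = "What is a deductible in a health insurance policy?"
# Prompt template mirrors the GGUF inference script below; the system text here is abbreviated.
prompt = (
    "system\nYou are OpenInsuranceLLM, an insurance domain expert.\n"
    f"user\nInsurance Question: {question}\nassistant\nInsurance Answer: "
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=200)
# Decode only the newly generated tokens, skipping the prompt.
print(tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```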
# Python script for inference with llama-cpp-python on a local CPU

- Windows (CPU wheel)
```bash
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
```
- Mac (pre-built Metal wheel)
```bash
pip uninstall -y llama-cpp-python && pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
```
- Mac (build from source with Metal enabled)
```bash
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
```

```python
import os
import time
import multiprocessing
from pathlib import Path
from textwrap import dedent

from huggingface_hub import hf_hub_download
from llama_cpp import Llama


def download_model():
    """Download the Q5_K_M GGUF file from the Hugging Face Hub into ./gguf_dir."""
    model_name = "Raj-Maharajwala/OpenInsuranceLLM-Llama3-8B-GGUF"
    model_file = "openinsurancellm-llama3-8b.Q5_K_M.gguf"
    model_path = hf_hub_download(model_name, filename=model_file,
                                 local_dir=os.path.join(os.getcwd(), "gguf_dir"))
    return model_path


def load_model(n_ctx, n_threads, n_batch, n_gpu_layers):
    """Load the GGUF model from ./gguf_dir, downloading it first if it is not there yet."""
    model_dir = Path(os.getcwd()) / "gguf_dir"
    matches = list(model_dir.glob("openinsurancellm*Q5*.gguf"))
    model_path = str(matches[0]) if matches else download_model()
    llm_ctx = Llama(model_path=model_path,
                    n_gpu_layers=n_gpu_layers,  # 0 = CPU only
                    n_ctx=n_ctx,
                    n_threads=n_threads,
                    n_batch=n_batch,
                    verbose=False)
    return llm_ctx


def get_prompt(question):
    """Build the prompt in the format used during fine-tuning."""
    system = (
        "You are an expert and experienced assistant from the insurance domain with extensive "
        "insurance knowledge and a professional writer familiar with all insurance policies. "
        "Your name is OpenInsuranceLLM, and you were developed by Raj Maharajwala. You are willing "
        "to help answer the user's query with an explanation. In your explanation, leverage your "
        "deep insurance expertise, such as relevant insurance policies, complex coverage plans, or "
        "other pertinent insurance concepts. Use precise insurance terminology while still aiming "
        "to make the explanation clear and accessible to a general audience."
    )
    return f"system\n{system}\nuser\nInsurance Question: {question}\nassistant\nInsurance Answer: "


def inference_loop(max_tokens=8025, top_k=15, n_gpu_layers=0, temperature=0.0,
                   n_ctx=8192, n_threads=multiprocessing.cpu_count() - 1, n_batch=512):
    print("Welcome to the OpenInsuranceLLM inference loop:\n\n")
    llm_ctx = load_model(n_ctx, n_threads, n_batch, n_gpu_layers)
    print(f"OpenInsuranceLLM Q5_K_M model loaded successfully with n_batch={n_batch}!\n\n"
          "Enter your question (or type 'exit' to quit)\n")
    while True:
        question = input("User: ").strip()
        if question.lower() == "exit":
            print("Assistant: Good Bye!")
            break
        prompt = get_prompt(question)
        start_time = time.time()
        response = llm_ctx(prompt, max_tokens=max_tokens, top_k=top_k, temperature=temperature)
        n_tokens = response["usage"]["completion_tokens"] or 1  # avoid division by zero
        answer = dedent(response["choices"][0]["text"])
        execution_time = time.time() - start_time
        print(f"Assistant: {answer}")
        print(f"tokens: {n_tokens}")
        print(f"Time: {execution_time:.2f} s  Per token: {execution_time / n_tokens:.2f} s  "
              f"Tokens/sec: {round(n_tokens / execution_time)}\n\n")


# Default params: max_tokens=8025, top_k=15, n_gpu_layers=0, temperature=0.0,
#                 n_ctx=8192, n_threads=cpu_count() - 1, n_batch=512
if __name__ == "__main__":
    inference_loop(top_k=10)
```

Arxiv: [https://arxiv.org/pdf/2401.10225](https://arxiv.org/pdf/2401.10225)
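If you installed a Metal (or CUDA) build of `llama-cpp-python`, the same script can offload layers to the GPU through its existing `n_gpu_layers` parameter. The call below is a sketch: `-1` (offload all layers) is an assumption that generally fits an 8B Q5_K_M model in 8 GB or more of GPU memory; pass a smaller number if it does not fit.

```python
# Hypothetical GPU run: -1 offloads every layer (Metal/CUDA builds only);
# lower the value if the model does not fit in GPU memory.
inference_loop(top_k=10, n_gpu_layers=-1)
```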