File size: 1,938 Bytes
67b4877 1c86444 07faeb9 342194b 07faeb9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
---
base_model: unsloth/llama-3-8b-bnb-4bit
library_name: peft  # PEFT version 0.13.2
license: mit
datasets:
- yahma/alpaca-cleaned
language:
- en
---
How to use:
```python
!pip install peft accelerate bitsandbytes
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel, PeftConfig
# Hub repositories used by this example.
ADAPTER_REPO = "Vijayendra/QST-Llama-8b"
BASE_MODEL_REPO = "unsloth/llama-3-8b-bnb-4bit"
TOKENIZER_REPO = "Vijayendra/llama3.0-8B-merged-4bit"

# Adapter configuration; it is loaded here but not referenced again in this snippet.
config = PeftConfig.from_pretrained(ADAPTER_REPO)

# Base causal LM first, then the PEFT adapter applied on top of it.
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_REPO)
model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)

# NOTE(review): the tokenizer comes from a different repo than the adapter —
# confirm this is intentional.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)

# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Define the inference function with TextStreamer
def generate_answer_with_stream(model, tokenizer, text, max_new_tokens=1024, temperature=0.5, top_k=40, top_p=0.9):
    """Generate an answer for ``text`` and stream it through a ``TextStreamer``.

    The question is wrapped in a fixed prompt template, tokenized, moved to
    ``model.device`` and passed to ``model.generate`` with sampling enabled.
    The value returned by ``model.generate`` is discarded, so this function
    returns ``None``; the generated text is only emitted by the streamer.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and by the streamer.
        text: Question text inserted into the prompt template.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
    """
    prompt = f"Answer the following question\n\n{text}\n\nQuestion:"

    # Encode the prompt as PyTorch tensors on the same device as the model.
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(model.device)

    # Sampling settings; the repetition penalty is fixed at 1.2.
    generation_options = {
        "attention_mask": encoded.attention_mask,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "do_sample": True,
        "top_k": top_k,
        "top_p": top_p,
        "repetition_penalty": 1.2,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
        # Stream output as it is generated.
        "streamer": TextStreamer(tokenizer),
    }

    # No gradients are needed for inference.
    with torch.no_grad():
        model.generate(encoded.input_ids, **generation_options)
# Example question passed to the model.
question = "What is quantum mechanics?"

# Run generation; the answer is streamed rather than returned.
generate_answer_with_stream(model=model, tokenizer=tokenizer, text=question)
|