NorGLM's picture
Update README.md
989f5f3 verified
|
raw
history blame
2.42 kB
metadata
license: cc-by-nc-sa-4.0
language:
  - 'no'

Model Card

NorGPT-369M-Instruction-peft is trained on top of the NorGPT-369M model using the NO-Alpaca dataset.

Prompt format:

{instruction} {input} : {output}

Inference prompt:

{instruction} {input} :

Run the Model

import json

import torch
from datasets import load_dataset
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

# Hub identifiers: the base causal LM and the PEFT adapter applied on top of it.
source_model_id = "NorGLM/NorGPT-369M"
peft_model_id = "NorGLM/NorGPT-369M-Instruction-peft"

# Load the adapter configuration and the base model.
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(source_model_id, device_map='balanced')

# The tokenizer comes from the base model, not from the adapter repository.
tokenizer_max_len = 2048
tokenizer_config = {'pretrained_model_name_or_path': source_model_id,
                            'max_len': tokenizer_max_len}
tokenizer = AutoTokenizer.from_pretrained(**tokenizer_config)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Wrap the base model with the instruction-tuned adapter weights.
model = PeftModel.from_pretrained(model, peft_model_id)

Inference Example

Load the model and evaluate it on the last 20% of the NO-Alpaca dataset:

def merge_columns(example):
    """Store the inference prompt for one record under the ``"text"`` key.

    The prompt is ``"<instruction> <input> : "``; when the stringified input
    is empty, the input and its separating space are left out, giving
    ``"<instruction> : "``.

    Args:
        example: Mapping with ``"instruction"`` and ``"input"`` entries. Both
            values are converted with ``str()`` before use.

    Returns:
        The same ``example`` object, updated in place with the ``"text"`` key.
    """
    extra_input = str(example["input"])
    instruction = str(example["instruction"])
    prompt = instruction if extra_input == "" else f"{instruction} {extra_input}"
    example["text"] = prompt + " : "
    return example

def generate_text(text, max_length=200, do_sample=True, top_p = 0.92, top_k=0):
    """Generate a completion for ``text`` with the module-level model.

    Relies on the module-level ``model`` and ``tokenizer`` objects. The random
    seed is reset to 42 on every call.

    Args:
        text: Prompt string to tokenize and feed to the model.
        max_length: Upper bound on newly generated tokens (forwarded as
            ``max_new_tokens``).
        do_sample: Forwarded to ``model.generate``; when true, ``top_p`` and
            ``top_k`` are forwarded as well.
        top_p: Nucleus-sampling threshold, used only when ``do_sample`` is true.
        top_k: Top-k cutoff, used only when ``do_sample`` is true.

    Returns:
        The decoded first output sequence with special tokens removed.
    """
    set_seed(42)
    # Send the inputs to the device the model reports for itself, rather than
    # a separately defined device name.
    model_inputs = tokenizer(text, return_tensors='pt').to(model.device)

    # Forward the sampling controls only when sampling is requested, so that
    # non-sampling calls do not receive sampling-only arguments.
    sampling_kwargs = {"do_sample": do_sample}
    if do_sample:
        sampling_kwargs["top_p"] = top_p
        sampling_kwargs["top_k"] = top_k

    output = model.generate(
        **model_inputs,
        max_new_tokens=max_length,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
        **sampling_kwargs,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

print("--LOADING EVAL DATAS---")
eval_data = load_dataset("NbAiLab/norwegian-alpaca", split='train[-20%:]')
# Build the "text" prompt column that the prediction loop below reads.
eval_data = eval_data.map(merge_columns)

print("--MAKING PREDICTIONS---")
model.eval()

# Destination for the predictions (one JSON object per line); change as needed.
output_file = "generated_text.jsonl"
with open(output_file, 'w', encoding='utf-8-sig') as file:
    generated_text = []

    for question in eval_data['text']:
        generated_text.append({"generated_text": generate_text(question)})
        # Progress indicator: number of prompts processed so far.
        print({"text_generated": len(generated_text)})

    # Serialize each prediction as one JSON line and write them in one call.
    json_lines = [json.dumps(data) for data in generated_text]
    json_data = "\n".join(json_lines)
    file.write(json_data)

Note

More training details will be released soon!