Model Card for Yuto-24/llm-jp-3-13B-Tengentoppa_magpie
This is a full-parameter fine-tuned model based on llm-jp/llm-jp-3-13B.
See the base model's details here.
It was built for the elyza-tasks-100-TV task, which Matsuo Lab created for one of its classes.
Model Details
Model Description
This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
- Developed by: Yuto-24
- Model type: Text Generation
- Language(s) (NLP): Japanese, English
- License: CC-BY-4.0
- Finetuned from model: llm-jp/llm-jp-3-13B
Model Sources [optional]
- Repository: coming soon...
Uses
Direct Use
numpy
torch>=2.3.0
datasets
transformers>=4.40.1
accelerate>=0.29.3
flash-attn>=2.5.8
FlagEmbedding
import torch
import numpy as np
from datasets import Dataset, load_dataset
from FlagEmbedding import BGEM3FlagModel
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TextStreamer,
BitsAndBytesConfig,
)
# ELYZA-tasks-100 is the reference corpus from which few-shot examples are retrieved.
elyza_tasks_datasets = load_dataset("elyza/ELYZA-tasks-100")

# Embedding model used for both the corpus and the incoming queries.
model = BGEM3FlagModel("BAAI/bge-m3")

# Work on a copy of the test-split inputs so the dataset column itself is never touched.
_elyza_test_inputs = elyza_tasks_datasets["test"]["input"]
target_texts = _elyza_test_inputs.copy()

# Embed the corpus once up front; only the dense vectors of the encoding are kept.
_target_encoding = model.encode(target_texts)
target_embeds = _target_encoding["dense_vecs"]
def retrieve(input_text, n=None):
    """Return the position(s) in ``target_texts`` most similar to ``input_text``.

    The query is embedded with the module-level ``model`` and scored against
    the precomputed ``target_embeds`` with a dot product.

    Args:
        input_text: Query text to embed.
        n: Number of matches to return. When omitted (``None``) a single
            ``int`` is returned, which keeps the original one-argument call
            working. When given, a list of at most ``n`` indices is returned,
            ordered from most to least similar.

    Returns:
        ``int`` when ``n`` is ``None``; otherwise ``list[int]``.

    Raises:
        ValueError: If ``n`` is given and is smaller than 1.
    """
    input_embeds = model.encode([input_text])["dense_vecs"]
    # Similarity scores, flattened to one score per target text
    # (assumes dense_vecs is a 2-D array with one row per text — TODO confirm).
    scores = np.asarray(input_embeds @ target_embeds.T).ravel()

    if n is None:
        # argmax already is the position in target_texts, so no text lookup is needed.
        return int(np.argmax(scores))

    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # A stable sort keeps tie-breaking deterministic (lowest index first).
    ranked = np.argsort(-scores, kind="stable")[:n]
    return [int(index) for index in ranked]
class CallLLM:
    """Wrapper around a causal LM loaded in 8-bit together with its tokenizer.

    Instances are callable: ``llm(text, call_type="chat_template", ...)`` builds
    a chat prompt, runs generation and returns the decoded completion.
    """

    def __init__(self, model_name_or_path: str) -> None:
        """Load the model (8-bit, ``device_map="auto"``) and tokenizer.

        Args:
            model_name_or_path: Identifier or path passed to ``from_pretrained``
                for both the model and the tokenizer.
        """
        self.quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            quantization_config=self.quantization_config,
            trust_remote_code=True,
            device_map="auto",
        ).eval()
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True,
        )
        # Used only when a call is made with stream=True.
        self.streamer = TextStreamer(
            self.tokenizer,
        )
        self.call_type = None
        print(f"{self.model.device = }")

    def __call__(self, input_text: str, call_type: str = None, stream=False, **kwargs):
        """Dispatch ``input_text`` to the prompt builder selected by ``call_type``.

        Args:
            input_text: User message; surrounding whitespace is stripped.
            call_type: Name of the prompt style. Only ``"chat_template"`` is
                registered.
            stream: Forwarded to generation; when true the streamer is attached.
            **kwargs: Forwarded to the prompt builder (e.g. ``system_prompt``)
                and from there to generation as parameter overrides.

        Returns:
            The decoded completion returned by the selected prompt builder.

        Raises:
            ValueError: If ``call_type`` is not a registered prompt style.
        """
        self.call_type = call_type
        call_type_dict = {
            "chat_template": self.__call_chat_template,
        }
        if self.call_type not in call_type_dict:
            raise ValueError(
                f"Please set the call_type. You can select from {call_type_dict.keys()}"
            )
        return call_type_dict[call_type](input_text.strip(), stream=stream, **kwargs)

    def merge_adapter(self, lora_adapter_path):
        """Attach a LoRA adapter to the loaded model and merge it into the weights.

        Args:
            lora_adapter_path: Location of the adapter passed to
                ``PeftModel.from_pretrained``.
        """
        # NOTE(review): `PeftModel` is not imported anywhere in the visible file;
        # calling this method needs `from peft import PeftModel` in scope.
        self.model = PeftModel.from_pretrained(self.model, lora_adapter_path)
        self.model = self.model.merge_and_unload()

    def __call_chat_template(self, input_text: str = "", system_prompt: str = "あなたは、大塚商会の誠実で優秀なアシスタントです。", **kwargs):
        """Build a chat prompt with the tokenizer's chat template and run generation.

        Args:
            input_text: User message; omitted from the prompt when empty.
            system_prompt: System message; omitted from the prompt when empty.
            **kwargs: Forwarded to ``__inference`` (must include ``stream``).

        Returns:
            The decoded completion.
        """
        prompt = []
        if system_prompt:
            prompt.append({"role": "system", "content": system_prompt})
        if input_text:
            prompt.append({"role": "user", "content": input_text})
        # NOTE(review): `add_generation_prompt` is not passed, so the template's
        # default applies — confirm this matches how the model was trained.
        tokenized_input = self.tokenizer.apply_chat_template(
            prompt,
            return_tensors="pt",
        )
        return self.__inference(tokenized_input, **kwargs)

    def __inference(self, tokenized_input, stream: bool, **kwargs):
        """Generate from ``tokenized_input`` and decode only the new tokens.

        Args:
            tokenized_input: Prompt token ids as returned by the chat template.
            stream: When true, ``self.streamer`` is passed to ``generate``.
            **kwargs: Overrides for the default generation parameters.

        Returns:
            The generated continuation of the first sequence, decoded with
            special tokens skipped.
        """
        tokenized_input = tokenized_input.to(self.model.device)
        inference_params = {
            # The prompt has no padding, so every position is attended to.
            "attention_mask": torch.ones_like(tokenized_input),
            "max_new_tokens": 512,
            "do_sample": False,
            "repetition_penalty": 1.2,
            "eos_token_id": self.tokenizer.eos_token_id,
            # The EOS token doubles as the padding token.
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        # Caller-supplied values win over the defaults above.
        inference_params.update(**kwargs)
        if stream:
            inference_params.update(streamer=self.streamer)

        with torch.no_grad():
            outputs = self.model.generate(
                tokenized_input,
                **inference_params,
            )[0]

        # Drop the prompt tokens so that only the completion is decoded.
        return self.tokenizer.decode(
            outputs[tokenized_input.size(1):],
            skip_special_tokens=True,
        )
# Model identifier handed to CallLLM (used for both the model and the tokenizer).
model_path_or_id = "Yuto-24/llm-jp-3-13B-Tengentoppa_magpie"
# Instantiate the wrapper; its constructor loads the model and tokenizer.
llm = CallLLM(model_path_or_id)
# System prompt template (Japanese). The `{examples}` placeholder is filled
# with the retrieved few-shot examples via `str.format` before each call.
SYSTEM_PROMPT = """
# あなたが必ず従うべき事項
## 役割
あなたは誠実で優秀なアシスタントです。
質問に対し、簡潔に答えます。
ハルシネーションをしません。
必ず正しい情報のみを答えます。
## 指示
- 評価観点に沿った出力を作成します。
- ユーザから特別な指示が与えられている場合には、必ず従います。
- 具体例には評価観点が含まれていますが、あなたが考える「出力」のみを回答してください。
- 評価観点は、人間があなたの出力を評価するために利用します。
- 論理的にステップバイステップで考えてください。
## 具体例
```markdown
{examples}
```
""".strip()
# Template for one few-shot example: input, evaluation aspect and reference
# output, each filled via `str.format`.
EXAMPLE_TEMPLATE = """
### 入力
{dataset_input}
### 評価観点
{dataset_eval_aspect}
### 出力
{dataset_answer}
""".strip()
# Load the task data.
# In the omnicampus development environment, drag and drop the task JSONL file
# into the left-hand pane before running this cell.
import os
import json

datasets = []
# "__file__" is deliberately a string literal: abspath() resolves it against the
# current working directory, so this also works where `__file__` is undefined.
task_file = f"{os.path.dirname(os.path.abspath('__file__'))}/workspace/elyza-tasks-100-TV_0.jsonl"
# The tasks contain Japanese text, so do not rely on the platform default encoding.
with open(task_file, "r", encoding="utf-8") as f:
    item = ""
    for line in f:
        # A record may be spread over several lines; accumulate until the
        # buffer ends with a closing brace, then parse it as one JSON object.
        item += line.strip()
        if item.endswith("}"):
            datasets.append(json.loads(item))
            item = ""
# Run the model on every task.
import re
from tqdm import tqdm

results = []
n = 2  # number of few-shot examples retrieved per task

# Fetch the reference columns once; indexing the dataset by column name inside
# the loop would rebuild the whole column on every access.
elyza_test = elyza_tasks_datasets["test"]
elyza_inputs = elyza_test["input"]
elyza_eval_aspects = elyza_test["eval_aspect"]
elyza_outputs = elyza_test["output"]

# Patterns that strip everything up to and including an echoed output header.
bold_header_re = re.compile(r"^[\s\S]*?\*\*出力\*\*:")
section_header_re = re.compile(r"^[\s\S]*?### 出力")

for data in tqdm(datasets, smoothing=0.0):
    input_text = data["input"]
    dataset_index_list = retrieve(input_text, n)

    # NOTE(review): examples are concatenated with no separator between them,
    # exactly as in the original prompt construction.
    example_parts = []
    for dataset_index in dataset_index_list:
        example_parts.append(
            EXAMPLE_TEMPLATE.format(
                dataset_input=elyza_inputs[dataset_index].strip(),
                dataset_eval_aspect=elyza_eval_aspects[dataset_index].strip(),
                dataset_answer=elyza_outputs[dataset_index].strip(),
            )
        )
    examples = "".join(example_parts)

    system_prompt = SYSTEM_PROMPT.format(
        examples=examples.strip(),
    )

    output = llm(
        input_text=input_text,
        system_prompt=system_prompt,
        call_type="chat_template",
        repetition_penalty=1.15,
    ).strip()

    print(output)
    print("===================================================================================================================================")
    # NOTE(review): the preview below strips both header styles, whereas the
    # stored "output" only strips the "### 出力" header.
    print(section_header_re.sub("", bold_header_re.sub("", output)).strip())
    print("-----------------------------------------------------------------------------------------------------------------------------------")

    # `dataset_index` is the loop variable left over from above, so only the
    # last retrieved example is recorded here.
    results.append({
        "task_id": data["task_id"],
        "input": input_text,
        "output_org": output,
        "output": section_header_re.sub("", output).strip(),
        "elyza_tasks_id": dataset_index,
        "dataset_input": elyza_inputs[dataset_index],
        "dataset_eval_aspect": elyza_eval_aspects[dataset_index],
        "dataset_answer": elyza_outputs[dataset_index],
    })
# At this point `results` holds the answer for every task.
from pprint import pprint
import pandas as pd

# Maximum number of columns to display.
pd.set_option("display.max_columns", 0)
# Maximum number of rows to display.
pd.set_option("display.max_rows", 100)
# Maximum displayed width of a single cell.
pd.set_option("display.max_colwidth", 550)

# Column-oriented view of the results; the retrieval metadata
# (elyza_tasks_id, dataset_*) is intentionally left out.
json4df = {
    column: [entry[column] for entry in results]
    for column in ("task_id", "input", "output", "output_org")
}

JSON_FILE_NAME = "llm-jp-3-13B-Tengentoppa-FPFT-magpie-FPFT-elyza-RAG_v2"
result4out = results.copy()
# Bare expression: shows the results when run as the last line of a notebook cell.
results
# Write the answers as JSONL. Each record keeps task_id, input, output_org and
# output; only task_id and output are actually required.
import os
import re
import sys
from os.path import dirname, abspath, join, isfile

result4out = results.copy()

# "__file__" is a string literal on purpose: abspath() resolves it against the
# current working directory, which also works inside a notebook.
WD = dirname(abspath("__file__"))
json_dir = join(
    WD,
    "..",
    "jsonl",
)

if JSON_FILE_NAME != "":
    file_path = join(json_dir, f"{JSON_FILE_NAME}.jsonl")
else:
    # Fall back to the model name: everything after the last "/" of the model id.
    jsonl_id = re.sub(".*/", "", model_path_or_id)
    file_path = join(json_dir, f"{jsonl_id}-outputs.jsonl")

# Refuse to overwrite earlier results. Raised explicitly because `assert`
# statements are skipped when Python runs with -O.
if isfile(file_path):
    raise FileExistsError(f"Error: File `{file_path}` is already exist.")

# Make sure the output directory exists before opening the file.
os.makedirs(json_dir, exist_ok=True)

# Retrieval metadata that is kept in `results` but not written to the file.
excluded_keys = {
    "elyza_tasks_id",
    "dataset_input",
    "dataset_eval_aspect",
    "dataset_answer",
}

with open(file_path, "w", encoding="utf-8") as f:
    for result in result4out:
        result = {k: v for k, v in result.items() if k not in excluded_keys}
        # ensure_ascii=False keeps Japanese text readable instead of \u-escaped.
        json.dump(result, f, ensure_ascii=False)
        f.write("\n")
Downstream Use [optional]
[More Information Needed]
Out-of-Scope Use
[More Information Needed]
Bias, Risks, and Limitations
[More Information Needed]
Recommendations
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
How to Get Started with the Model
Use the code below to get started with the model.
[More Information Needed]
Training Details
Training Data
- DeL-TaiseiOzaki/Tengentoppa-sft-v1.0
- llm-jp/magpie-sft-v1.0
- ntotsuka123/clean3-ultraboros-20k-ja-filter
Training Procedure
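Trained with axolotl in two stages, using the two YAML configurations below. The second stage starts from the output directory of the first.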
base_model: llm-jp/llm-jp-3-13b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: false
strict: false
# domain_yyyymmdd
output_dir: outputs/matsuo/llm-jp/3/13B/FPFT_20241213
chat_template: chatml
default_system_message: あなたは、大塚商会の誠実で優秀なアシスタントです。
shuffle_merged_datasets: true
datasets:
# # General
# - path: data/general/magpie-sft-v1.0.jsonl
# ds_type: json
# type: chat_template
# chat_template: chatml
# field_messages: conversations
# message_field_role: role
# message_field_content: content
# roles:
# user:
# - user
# assistant:
# - assistant
# system:
# - system
- path: data/general/Tengentoppa-sft-v1.0.jsonl
ds_type: json
type: alpaca
# - path: data/general/clean3-ultraboros-20k-ja-filter_train.jsonl
# ds_type: json
# type: chat_template
# # chat_template: chatml
# field_messages: conversations
# message_field_role: role
# message_field_content: value
# roles:
# user:
# - human
# assistant:
# - gpt
# system:
# - system
# train_on_eos: turn
val_set_size: 0.05
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
# warmup_steps: 100
warmup_ratio: 0.1
evals_per_epoch: 1
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero3.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
eos_token: <|im_end|>
base_model: outputs/matsuo/llm-jp/3/13B/FPFT_20241213
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: false
strict: false
# domain_yyyymmdd
output_dir: outputs/matsuo/llm-jp/3/13B/FPFT_20241215
chat_template: chatml
default_system_message: あなたは、大塚商会の誠実で優秀なアシスタントです。
shuffle_merged_datasets: true
datasets:
- path: data/general/magpie-sft-v1.0.jsonl
ds_type: json
type: chat_template
chat_template: chatml
field_messages: conversations
message_field_role: role
message_field_content: content
roles:
user:
- user
assistant:
- assistant
system:
- system
# - path: data/general/Tengentoppa-sft-v1.0.jsonl
# ds_type: json
# type: alpaca
- path: data/general/clean3-ultraboros-20k-ja-filter_train.jsonl
ds_type: json
type: chat_template
chat_template: chatml
field_messages: conversations
message_field_role: role
message_field_content: value
roles:
user:
- human
assistant:
- gpt
system:
- system
## NOTE: Leaving the below empty will default to using the simple legacy tokenization strategy where only last message is trained on.
# Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
roles_to_train: ["gpt", "assistant"]
# Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
# - all: train on all EOS tokens
# - turn: train on the EOS token at the end of each trainable turn
# - last: train on the last EOS token in the conversation
train_on_eos: last
val_set_size: 0.05
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
# warmup_steps: 100
warmup_ratio: 0.1
evals_per_epoch: 1
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero3.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
eos_token: <|im_end|>
Evaluation
Testing Data, Factors & Metrics
Testing Data
[More Information Needed]
Factors
[More Information Needed]
Metrics
[More Information Needed]
Results
[More Information Needed]
Summary
Model Examination [optional]
[More Information Needed]
Environmental Impact
Carbon emissions can be estimated using the Machine Learning Impact calculator presented in Lacoste et al. (2019).
- Hardware Type: [More Information Needed]
- Hours used: [More Information Needed]
- Cloud Provider: [More Information Needed]
- Compute Region: [More Information Needed]
- Carbon Emitted: [More Information Needed]
Technical Specifications [optional]
Model Architecture and Objective
[More Information Needed]
Compute Infrastructure
[More Information Needed]
Hardware
[More Information Needed]
Software
[More Information Needed]
Citation [optional]
BibTeX:
[More Information Needed]
APA:
[More Information Needed]
Glossary [optional]
[More Information Needed]
More Information [optional]
[More Information Needed]
Model Card Authors [optional]
[More Information Needed]
Model Card Contact
[More Information Needed]
- Downloads last month
- 7
Model tree for Yuto-24/llm-jp-3-13B-Tengentoppa_magpie
Base model
llm-jp/llm-jp-3-13b