diff --git a/LLM-Detector-V4-11w/src/api_demo.py b/LLM-Detector-V4-11w/src/api_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..a9afba4970fd8ef2f173972511957637eebc908d --- /dev/null +++ b/LLM-Detector-V4-11w/src/api_demo.py @@ -0,0 +1,14 @@ +import uvicorn + +from llmtuner import ChatModel, create_app + + +def main(): + chat_model = ChatModel() + app = create_app(chat_model) + print("Visit http://localhost:8001/docs for the API documentation.") + uvicorn.run(app, host="0.0.0.0", port=8001, workers=1) + + +if __name__ == "__main__": + main() diff --git a/LLM-Detector-V4-11w/src/cli_demo.py b/LLM-Detector-V4-11w/src/cli_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..781aa551325406b5cdde1987bf2223f68b2e4f97 --- /dev/null +++ b/LLM-Detector-V4-11w/src/cli_demo.py @@ -0,0 +1,47 @@ +from llmtuner import ChatModel +from llmtuner.extras.misc import torch_gc + +try: + import platform + if platform.system() != "Windows": + import readline +except ImportError: + print("Install `readline` for a better experience.") + + +def main(): + chat_model = ChatModel() + history = [] + print("Welcome to the CLI application. Use `clear` to remove the history and `exit` to quit the application.") + + while True: + try: + query = input("\nUser: ") + except UnicodeDecodeError: + print("Detected a decoding error in the input, please set the terminal encoding to utf-8.") + continue + except Exception: + raise + + if query.strip() == "exit": + break + + if query.strip() == "clear": + history = [] + torch_gc() + print("History has been removed.") + continue + + print("Assistant: ", end="", flush=True) + + response = "" + for new_text in chat_model.stream_chat(query, history): + print(new_text, end="", flush=True) + response += new_text + print() + + history = history + [(query, response)] + + +if __name__ == "__main__": + main() diff --git a/LLM-Detector-V4-11w/src/evaluate.py b/LLM-Detector-V4-11w/src/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..13796c0c2d3ab6e83c6679d953e2017a3c519904 --- /dev/null +++ b/LLM-Detector-V4-11w/src/evaluate.py @@ -0,0 +1,10 @@ +from llmtuner import Evaluator + + +def main(): + evaluator = Evaluator() + evaluator.eval() + + +if __name__ == "__main__": + main() diff --git a/LLM-Detector-V4-11w/src/export_model.py b/LLM-Detector-V4-11w/src/export_model.py new file mode 100644 index 0000000000000000000000000000000000000000..4baeb2c3eb9423ace07a32bd019587596f6083cf --- /dev/null +++ b/LLM-Detector-V4-11w/src/export_model.py @@ -0,0 +1,9 @@ +from llmtuner import export_model + + +def main(): + export_model() + + +if __name__ == "__main__": + main() diff --git a/LLM-Detector-V4-11w/src/llmtuner/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..27b96d01ab3f9f429be2f730c4cf8874b3637c59 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/__init__.py @@ -0,0 +1,10 @@ +# Level: api, webui > chat, eval, train > data, model > extras, hparams + +from llmtuner.api import create_app +from llmtuner.chat import ChatModel +from llmtuner.eval import Evaluator +from llmtuner.train import export_model, run_exp +from llmtuner.webui import create_ui, create_web_demo + + +__version__ = "0.3.2" diff --git a/LLM-Detector-V4-11w/src/llmtuner/api/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b3ce183a5a14b791b19e506dd1086a625da81965 --- /dev/null +++
b/LLM-Detector-V4-11w/src/llmtuner/api/__init__.py @@ -0,0 +1 @@ +from llmtuner.api.app import create_app diff --git a/LLM-Detector-V4-11w/src/llmtuner/api/app.py b/LLM-Detector-V4-11w/src/llmtuner/api/app.py new file mode 100644 index 0000000000000000000000000000000000000000..c01fa0dfafef6052847d37249352e3b4bfe6fc14 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/api/app.py @@ -0,0 +1,165 @@ +import json +from typing import List, Tuple +from pydantic import BaseModel +from contextlib import asynccontextmanager + +from llmtuner.api.protocol import ( + Role, + Finish, + ModelCard, + ModelList, + ChatMessage, + DeltaMessage, + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionStreamResponse, + ChatCompletionResponseChoice, + ChatCompletionResponseStreamChoice, + ChatCompletionResponseUsage +) +from llmtuner.chat import ChatModel +from llmtuner.extras.misc import torch_gc +from llmtuner.extras.packages import ( + is_fastapi_availble, is_starlette_available, is_uvicorn_available +) + + +if is_fastapi_availble(): + from fastapi import FastAPI, HTTPException, status + from fastapi.middleware.cors import CORSMiddleware + + +if is_starlette_available(): + from sse_starlette import EventSourceResponse + + +if is_uvicorn_available(): + import uvicorn + + +@asynccontextmanager +async def lifespan(app: "FastAPI"): # collects GPU memory + yield + torch_gc() + + +def to_json(data: BaseModel) -> str: + try: # pydantic v2 + return json.dumps(data.model_dump(exclude_unset=True), ensure_ascii=False) + except: # pydantic v1 + return data.json(exclude_unset=True, ensure_ascii=False) + + +def create_app(chat_model: "ChatModel") -> "FastAPI": + app = FastAPI(lifespan=lifespan) + + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + @app.get("/v1/models", response_model=ModelList) + async def list_models(): + model_card = ModelCard(id="gpt-3.5-turbo") + return ModelList(data=[model_card]) + + @app.post("/v1/chat/completions", response_model=ChatCompletionResponse, status_code=status.HTTP_200_OK) + async def create_chat_completion(request: ChatCompletionRequest): + if len(request.messages) == 0 or request.messages[-1].role != Role.USER: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request") + + query = request.messages[-1].content + prev_messages = request.messages[:-1] + if len(prev_messages) and prev_messages[0].role == Role.SYSTEM: + system = prev_messages.pop(0).content + else: + system = None + + history = [] + if len(prev_messages) % 2 == 0: + for i in range(0, len(prev_messages), 2): + if prev_messages[i].role == Role.USER and prev_messages[i+1].role == Role.ASSISTANT: + history.append([prev_messages[i].content, prev_messages[i+1].content]) + else: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...") + else: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...") + + if request.stream: + generate = predict(query, history, system, request) + return EventSourceResponse(generate, media_type="text/event-stream") + + responses = chat_model.chat( + query, history, system, + do_sample=request.do_sample, + temperature=request.temperature, + top_p=request.top_p, + max_new_tokens=request.max_tokens, + num_return_sequences=request.n + ) + + prompt_length, response_length = 0, 0 + choices = [] + for i, response in enumerate(responses): + choices.append(ChatCompletionResponseChoice( + 
index=i, + message=ChatMessage(role=Role.ASSISTANT, content=response.response_text), + finish_reason=Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH + )) + prompt_length = response.prompt_length + response_length += response.response_length + + usage = ChatCompletionResponseUsage( + prompt_tokens=prompt_length, + completion_tokens=response_length, + total_tokens=prompt_length+response_length + ) + + return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) + + async def predict(query: str, history: List[Tuple[str, str]], system: str, request: ChatCompletionRequest): + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(role=Role.ASSISTANT), + finish_reason=None + ) + chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data]) + yield to_json(chunk) + + for new_text in chat_model.stream_chat( + query, history, system, + do_sample=request.do_sample, + temperature=request.temperature, + top_p=request.top_p, + max_new_tokens=request.max_tokens + ): + if len(new_text) == 0: + continue + + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(content=new_text), + finish_reason=None + ) + chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data]) + yield to_json(chunk) + + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(), + finish_reason=Finish.STOP + ) + chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data]) + yield to_json(chunk) + yield "[DONE]" + + return app + + +if __name__ == "__main__": + chat_model = ChatModel() + app = create_app(chat_model) + uvicorn.run(app, host="0.0.0.0", port=8000, workers=1) diff --git a/LLM-Detector-V4-11w/src/llmtuner/api/protocol.py b/LLM-Detector-V4-11w/src/llmtuner/api/protocol.py new file mode 100644 index 0000000000000000000000000000000000000000..6b99da402eb19b79f901f99628313b4cecdd7283 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/api/protocol.py @@ -0,0 +1,83 @@ +import time +from enum import Enum +from pydantic import BaseModel, Field +from typing import List, Optional + + +class Role(str, Enum): + USER = "user" + ASSISTANT = "assistant" + SYSTEM = "system" + + +class Finish(str, Enum): + STOP = "stop" + LENGTH = "length" + + +class ModelCard(BaseModel): + id: str + object: Optional[str] = "model" + created: Optional[int] = Field(default_factory=lambda: int(time.time())) + owned_by: Optional[str] = "owner" + + +class ModelList(BaseModel): + object: Optional[str] = "list" + data: Optional[List[ModelCard]] = [] + + +class ChatMessage(BaseModel): + role: Role + content: str + + +class DeltaMessage(BaseModel): + role: Optional[Role] = None + content: Optional[str] = None + + +class ChatCompletionRequest(BaseModel): + model: str + messages: List[ChatMessage] + do_sample: Optional[bool] = True + temperature: Optional[float] = None + top_p: Optional[float] = None + n: Optional[int] = 1 + max_tokens: Optional[int] = None + stream: Optional[bool] = False + + +class ChatCompletionResponseChoice(BaseModel): + index: int + message: ChatMessage + finish_reason: Finish + + +class ChatCompletionResponseStreamChoice(BaseModel): + index: int + delta: DeltaMessage + finish_reason: Optional[Finish] = None + + +class ChatCompletionResponseUsage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class ChatCompletionResponse(BaseModel): + id: Optional[str] = "chatcmpl-default" + object: Optional[str] = "chat.completion" + created: 
Optional[int] = Field(default_factory=lambda: int(time.time())) + model: str + choices: List[ChatCompletionResponseChoice] + usage: ChatCompletionResponseUsage + + +class ChatCompletionStreamResponse(BaseModel): + id: Optional[str] = "chatcmpl-default" + object: Optional[str] = "chat.completion.chunk" + created: Optional[int] = Field(default_factory=lambda: int(time.time())) + model: str + choices: List[ChatCompletionResponseStreamChoice] diff --git a/LLM-Detector-V4-11w/src/llmtuner/chat/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/chat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f86efe96374e7f6127cb87c7a26dd63fbb9171d5 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/chat/__init__.py @@ -0,0 +1 @@ +from llmtuner.chat.chat_model import ChatModel diff --git a/LLM-Detector-V4-11w/src/llmtuner/chat/chat_model.py b/LLM-Detector-V4-11w/src/llmtuner/chat/chat_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9966a8137b01805adb64dd2b378f3f9b8e2e266c --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/chat/chat_model.py @@ -0,0 +1,132 @@ +import torch +from dataclasses import dataclass +from typing import Any, Dict, Generator, List, Literal, Optional, Tuple +from threading import Thread +from transformers import GenerationConfig, TextIteratorStreamer + +from llmtuner.data.template import get_template_and_fix_tokenizer +from llmtuner.extras.misc import get_logits_processor +from llmtuner.model import dispatch_model, get_infer_args, load_model_and_tokenizer + + +@dataclass +class Response: + + response_text: str + response_length: int + prompt_length: int + finish_reason: Literal["stop", "length"] + + +class ChatModel: + + def __init__(self, args: Optional[Dict[str, Any]] = None) -> None: + model_args, data_args, finetuning_args, self.generating_args = get_infer_args(args) + self.model, self.tokenizer = load_model_and_tokenizer(model_args, finetuning_args) + self.tokenizer.padding_side = "left" + self.model = dispatch_model(self.model) + self.template = get_template_and_fix_tokenizer(data_args.template, self.tokenizer) + self.system_prompt = data_args.system_prompt + + def _process_args( + self, + query: str, + history: Optional[List[Tuple[str, str]]] = None, + system: Optional[str] = None, + **input_kwargs + ) -> Tuple[Dict[str, Any], int]: + system = system or self.system_prompt + prompt, _ = self.template.encode_oneturn( + tokenizer=self.tokenizer, query=query, resp="", history=history, system=system + ) + prompt_length = len(prompt) + input_ids = torch.tensor([prompt], device=self.model.device) + + do_sample = input_kwargs.pop("do_sample", None) + temperature = input_kwargs.pop("temperature", None) + top_p = input_kwargs.pop("top_p", None) + top_k = input_kwargs.pop("top_k", None) + num_return_sequences = input_kwargs.pop("num_return_sequences", None) + repetition_penalty = input_kwargs.pop("repetition_penalty", None) + max_length = input_kwargs.pop("max_length", None) + max_new_tokens = input_kwargs.pop("max_new_tokens", None) + + generating_args = self.generating_args.to_dict() + generating_args.update(dict( + do_sample=do_sample if do_sample is not None else generating_args["do_sample"], + temperature=temperature or generating_args["temperature"], + top_p=top_p or generating_args["top_p"], + top_k=top_k or generating_args["top_k"], + num_return_sequences=num_return_sequences or 1, + repetition_penalty=repetition_penalty or generating_args["repetition_penalty"], + eos_token_id=[self.tokenizer.eos_token_id] + 
self.tokenizer.additional_special_tokens_ids, + pad_token_id=self.tokenizer.pad_token_id + )) + + if isinstance(num_return_sequences, int) and num_return_sequences > 1: + generating_args["do_sample"] = True + + if max_length: + generating_args.pop("max_new_tokens", None) + generating_args["max_length"] = max_length + + if max_new_tokens: + generating_args.pop("max_length", None) + generating_args["max_new_tokens"] = max_new_tokens + + gen_kwargs = dict( + inputs=input_ids, + generation_config=GenerationConfig(**generating_args), + logits_processor=get_logits_processor() + ) + + return gen_kwargs, prompt_length + + @torch.inference_mode() + def chat( + self, + query: str, + history: Optional[List[Tuple[str, str]]] = None, + system: Optional[str] = None, + **input_kwargs + ) -> List[Response]: + r""" + Args: query, history, system, **input_kwargs + + Returns: [(response_text, prompt_length, response_length)] * n (default n=1) + """ + gen_kwargs, prompt_length = self._process_args(query, history, system, **input_kwargs) + generate_output = self.model.generate(**gen_kwargs) + response_ids = generate_output[:, prompt_length:] + response = self.tokenizer.batch_decode( + response_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + results = [] + for i in range(len(response)): + eos_index = (response_ids[i] == self.tokenizer.eos_token_id).nonzero() + response_length = (eos_index[0].item() + 1) if len(eos_index) else len(response_ids[i]) + results.append(Response( + response_text=response[i], + response_length=response_length, + prompt_length=prompt_length, + finish_reason="stop" if len(eos_index) else "length" + )) + + return results + + @torch.inference_mode() + def stream_chat( + self, + query: str, + history: Optional[List[Tuple[str, str]]] = None, + system: Optional[str] = None, + **input_kwargs + ) -> Generator[str, None, None]: + gen_kwargs, _ = self._process_args(query, history, system, **input_kwargs) + streamer = TextIteratorStreamer(self.tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True) + gen_kwargs["streamer"] = streamer + + thread = Thread(target=self.model.generate, kwargs=gen_kwargs) + thread.start() + + yield from streamer diff --git a/LLM-Detector-V4-11w/src/llmtuner/data/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35f7caa37ad84381d5ca0b028ad89a3316f50d01 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/data/__init__.py @@ -0,0 +1,4 @@ +from llmtuner.data.loader import get_dataset +from llmtuner.data.preprocess import preprocess_dataset +from llmtuner.data.template import get_template_and_fix_tokenizer +from llmtuner.data.utils import split_dataset diff --git a/LLM-Detector-V4-11w/src/llmtuner/data/loader.py b/LLM-Detector-V4-11w/src/llmtuner/data/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..8e9053cac1caf7a8208f89cd4d7c940c62cc9d17 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/data/loader.py @@ -0,0 +1,148 @@ +import os +from typing import TYPE_CHECKING, Any, Dict, List, Union + +from datasets import concatenate_datasets, interleave_datasets, load_dataset + +from llmtuner.data.utils import checksum, EXT2TYPE +from llmtuner.extras.logging import get_logger + +if TYPE_CHECKING: + from datasets import Dataset, IterableDataset + from llmtuner.hparams import ModelArguments, DataArguments + + +logger = get_logger(__name__) + + +def get_dataset( + model_args: "ModelArguments", + data_args: "DataArguments" +) -> 
Union["Dataset", "IterableDataset"]: + max_samples = data_args.max_samples + all_datasets: List[Union["Dataset", "IterableDataset"]] = [] # support multiple datasets + + for dataset_attr in data_args.dataset_list: + logger.info("Loading dataset {}...".format(dataset_attr)) + + if dataset_attr.load_from == "hf_hub": + data_path = dataset_attr.dataset_name + data_name = dataset_attr.subset + data_files = None + elif dataset_attr.load_from == "script": + data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name) + data_name = dataset_attr.subset + data_files = None + elif dataset_attr.load_from == "file": + data_path, data_name = None, None + data_files: List[str] = [] + if os.path.isdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is directory + for file_name in os.listdir(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): + data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name, file_name)) + if data_path is None: + data_path = EXT2TYPE.get(file_name.split(".")[-1], None) + else: + assert data_path == EXT2TYPE.get(file_name.split(".")[-1], None), "file types are not identical." + elif os.path.isfile(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)): # is file + data_files.append(os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)) + data_path = EXT2TYPE.get(dataset_attr.dataset_name.split(".")[-1], None) + else: + raise ValueError("File not found.") + + assert data_path, "File extension must be txt, csv, json or jsonl." + checksum(data_files, dataset_attr.dataset_sha1) + else: + raise NotImplementedError + + dataset = load_dataset( + path=data_path, + name=data_name, + data_files=data_files, + split=data_args.split, + cache_dir=model_args.cache_dir, + token=model_args.hf_hub_token, + streaming=(data_args.streaming and (dataset_attr.load_from != "file")) + ) + + if data_args.streaming and (dataset_attr.load_from == "file"): + dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter + + if max_samples is not None: # truncate dataset + dataset = dataset.select(range(min(len(dataset), max_samples))) + + def convert_format(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]: + # convert dataset from sharegpt format to alpaca format + outputs = {"prompt": [], "query": [], "response": [], "history": []} + for msg_list in examples[dataset_attr.messages]: + msg_list = msg_list[:len(msg_list) // 2 * 2] # should be multiples of 2 + if len(msg_list) == 0: + continue + + msg_pairs = [] + user_role, assistant_role = None, None + for idx in range(0, len(msg_list), 2): + if user_role is None and assistant_role is None: + user_role = msg_list[idx][dataset_attr.role] + assistant_role = msg_list[idx + 1][dataset_attr.role] + else: + if ( + msg_list[idx][dataset_attr.role] != user_role + or msg_list[idx+1][dataset_attr.role] != assistant_role + ): + raise ValueError("Only accepts conversation in u/a/u/a/u/a order.") + msg_pairs.append((msg_list[idx][dataset_attr.content], msg_list[idx + 1][dataset_attr.content])) + + if len(msg_pairs) != 0: + outputs["prompt"].append(msg_pairs[-1][0]) + outputs["query"].append("") + outputs["response"].append(msg_pairs[-1][1]) + outputs["history"].append(msg_pairs[:-1]) + + return outputs + + if dataset_attr.formatting == "sharegpt": # convert format + column_names = list(next(iter(dataset)).keys()) + kwargs = {} + if not data_args.streaming: + kwargs = dict( + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=(not 
data_args.overwrite_cache), + desc="Converting format of dataset" + ) + + dataset = dataset.map( + convert_format, + batched=True, + remove_columns=column_names, + **kwargs + ) + else: + for column_name in ["prompt", "query", "response", "history"]: # align dataset + if getattr(dataset_attr, column_name) and getattr(dataset_attr, column_name) != column_name: + dataset = dataset.rename_column(getattr(dataset_attr, column_name), column_name) + + if dataset_attr.system_prompt: # add system prompt + system_prompt = dataset_attr.system_prompt + if data_args.streaming: + dataset = dataset.map(lambda _: {"system": system_prompt}) + else: + dataset = dataset.add_column("system", [system_prompt] * len(dataset)) + + all_datasets.append(dataset) + + if len(data_args.dataset_list) == 1: + return all_datasets[0] + elif data_args.mix_strategy == "concat": + if data_args.streaming: + logger.warning("The samples between different datasets will not be mixed in streaming mode.") + return concatenate_datasets(all_datasets) + elif data_args.mix_strategy.startswith("interleave"): + if not data_args.streaming: + logger.warning("We recommend using `mix_strategy=concat` in non-streaming mode.") + return interleave_datasets( + datasets=all_datasets, + probabilities=data_args.interleave_probs, + seed=data_args.seed, + stopping_strategy="first_exhausted" if data_args.mix_strategy.endswith("under") else "all_exhausted" + ) + else: + raise ValueError("Unknown mixing strategy.") diff --git a/LLM-Detector-V4-11w/src/llmtuner/data/preprocess.py b/LLM-Detector-V4-11w/src/llmtuner/data/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..2d2b2db605d6d45bce0042c0f5b86142fba75808 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/data/preprocess.py @@ -0,0 +1,275 @@ +import os +import tiktoken +from itertools import chain +from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Tuple, Union + +from datasets import load_from_disk + +from llmtuner.data.template import get_template_and_fix_tokenizer +from llmtuner.extras.constants import IGNORE_INDEX +from llmtuner.extras.logging import get_logger + +if TYPE_CHECKING: + from datasets import Dataset, IterableDataset + from transformers import Seq2SeqTrainingArguments + from transformers.tokenization_utils import PreTrainedTokenizer + from llmtuner.hparams import DataArguments + + +logger = get_logger(__name__) + + +def construct_example(examples: Dict[str, List[Any]]) -> Generator[Any, None, None]: + for i in range(len(examples["prompt"])): + query, response = examples["prompt"][i], examples["response"][i] + query = query + "\n" + examples["query"][i] if "query" in examples and examples["query"][i] else query + history = examples["history"][i] if "history" in examples else None + system = examples["system"][i] if "system" in examples else None + yield query, response, history, system + + +def infer_max_len(source_len: int, target_len: int, data_args: "DataArguments") -> Tuple[int, int]: + max_target_len = int(data_args.cutoff_len * (target_len / (source_len + target_len))) + max_target_len = max(max_target_len, data_args.reserved_label_len) + max_source_len = data_args.cutoff_len - max_target_len + return max_source_len, max_target_len + + +def preprocess_dataset( + dataset: Union["Dataset", "IterableDataset"], + tokenizer: "PreTrainedTokenizer", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["pt", "sft", "rm", "ppo"] +) -> Union["Dataset", "IterableDataset"]: + template = 
get_template_and_fix_tokenizer(data_args.template, tokenizer) + + if data_args.train_on_prompt and template.efficient_eos: + raise ValueError("Current template does not support `train_on_prompt`.") + + def preprocess_pretrain_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]: + # build grouped texts with format `X1 X2 X3 ...` + if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen) + kwargs = dict(allowed_special="all") + else: + kwargs = dict(add_special_tokens=True) + + if hasattr(tokenizer, "add_eos_token"): # for LLaMA tokenizer + add_eos_token_flag = getattr(tokenizer, "add_eos_token") + setattr(tokenizer, "add_eos_token", True) + + tokenized_examples = tokenizer(examples["prompt"], **kwargs) + concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()} + total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]]) + block_size = data_args.cutoff_len + # we drop the small remainder, and if the total_length < block_size, we exclude this batch + total_length = (total_length // block_size) * block_size + # split by chunks of cutoff_len + result = { + k: [t[i: i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + # make sure the saved tokenizer is the same as the original one + if hasattr(tokenizer, "add_eos_token"): + setattr(tokenizer, "add_eos_token", add_eos_token_flag) + return result + + def preprocess_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]: + # build inputs with format ` X Y ` and labels with format ` ... Y ` + # for multiturn examples, we only mask the prompt part in each prompt-response pair. + model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} + + for query, response, history, system in construct_example(examples): + if not (isinstance(query, str) and isinstance(response, str) and query != "" and response != ""): + continue + + input_ids, labels = [], [] + for turn_idx, (source_ids, target_ids) in enumerate(template.encode_multiturn( + tokenizer, query, response, history, system + )): + source_len, target_len = len(source_ids), len(target_ids) + max_source_len, max_target_len = infer_max_len(source_len, target_len, data_args) + if source_len > max_source_len: + source_ids = source_ids[:max_source_len] + if target_len > max_target_len: + target_ids = target_ids[:max_target_len] + + if data_args.train_on_prompt: + source_mask = source_ids + elif turn_idx != 0 and template.efficient_eos: + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) + else: + source_mask = [IGNORE_INDEX] * len(source_ids) + + input_ids += source_ids + target_ids + labels += source_mask + target_ids + + if template.efficient_eos: + input_ids += [tokenizer.eos_token_id] + labels += [tokenizer.eos_token_id] + + if len(input_ids) > data_args.cutoff_len: + input_ids = input_ids[:data_args.cutoff_len] + labels = labels[:data_args.cutoff_len] + + model_inputs["input_ids"].append(input_ids) + model_inputs["attention_mask"].append([1] * len(input_ids)) + model_inputs["labels"].append(labels) + + return model_inputs + + def preprocess_packed_supervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]: + # build inputs with format ` X1 Y1 X2 Y2 ` + # and labels with format ` ... Y1 ... 
Y2 ` + model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} + input_ids, labels = [], [] + for query, response, history, system in construct_example(examples): + if not (isinstance(query, str) and isinstance(response, str) and query != "" and response != ""): + continue + + for turn_idx, (source_ids, target_ids) in enumerate(template.encode_multiturn( + tokenizer, query, response, history, system + )): + if data_args.train_on_prompt: + source_mask = source_ids + elif turn_idx != 0 and template.efficient_eos: + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) + else: + source_mask = [IGNORE_INDEX] * len(source_ids) + input_ids += source_ids + target_ids + labels += source_mask + target_ids + + if template.efficient_eos: + input_ids += [tokenizer.eos_token_id] + labels += [tokenizer.eos_token_id] + + total_length = len(input_ids) + block_size = data_args.cutoff_len + # we drop the small remainder, and if the total_length < block_size, we exclude this batch + total_length = (total_length // block_size) * block_size + # split by chunks of cutoff_len + for i in range(0, total_length, block_size): + model_inputs["input_ids"].append(input_ids[i: i + block_size]) + model_inputs["attention_mask"].append([1] * block_size) + model_inputs["labels"].append(labels[i: i + block_size]) + + return model_inputs + + def preprocess_unsupervised_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]: + # build inputs with format ` X` and labels with format `Y ` + model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} + + for query, response, history, system in construct_example(examples): + if not (isinstance(query, str) and query != ""): + continue + + input_ids, labels = template.encode_oneturn(tokenizer, query, response, history, system) + + if template.efficient_eos: + labels += [tokenizer.eos_token_id] + + if len(input_ids) > data_args.cutoff_len: + input_ids = input_ids[:data_args.cutoff_len] + if len(labels) > data_args.cutoff_len: + labels = labels[:data_args.cutoff_len] + + model_inputs["input_ids"].append(input_ids) + model_inputs["attention_mask"].append([1] * len(input_ids)) + model_inputs["labels"].append(labels) + + return model_inputs + + def preprocess_pairwise_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]: + # build input pairs with format ` X`, `Y1 ` and `Y2 ` + model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []} + for query, response, history, system in construct_example(examples): + if not (isinstance(query, str) and isinstance(response, list) and query != "" and len(response) > 1): + continue + + prompt_ids, chosen_ids = template.encode_oneturn(tokenizer, query, response[0], history, system) + _, rejected_ids = template.encode_oneturn(tokenizer, query, response[1], history, system) + + if template.efficient_eos: + chosen_ids += [tokenizer.eos_token_id] + rejected_ids += [tokenizer.eos_token_id] + + source_len, target_len = len(prompt_ids), max(len(chosen_ids), len(rejected_ids)) + max_source_len, max_target_len = infer_max_len(source_len, target_len, data_args) + if source_len > max_source_len: + prompt_ids = prompt_ids[:max_source_len] + if target_len > max_target_len: + chosen_ids = chosen_ids[:max_target_len] + rejected_ids = rejected_ids[:max_target_len] + + model_inputs["prompt_ids"].append(prompt_ids) + model_inputs["chosen_ids"].append(chosen_ids) + model_inputs["rejected_ids"].append(rejected_ids) + + return model_inputs + + def 
print_supervised_dataset_example(example: Dict[str, List[int]]) -> None: + print("input_ids:\n{}".format(example["input_ids"])) + print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) + print("label_ids:\n{}".format(example["labels"])) + print("labels:\n{}".format( + tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False) + )) + + def print_pairwise_dataset_example(example: Dict[str, List[int]]) -> None: + print("prompt_ids:\n{}".format(example["prompt_ids"])) + print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False))) + print("chosen_ids:\n{}".format(example["chosen_ids"])) + print("chosen:\n{}".format(tokenizer.decode(example["chosen_ids"], skip_special_tokens=False))) + print("rejected_ids:\n{}".format(example["rejected_ids"])) + print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False))) + + def print_unsupervised_dataset_example(example: Dict[str, List[int]]) -> None: + print("input_ids:\n{}".format(example["input_ids"])) + print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) + + if stage == "pt": + preprocess_func = preprocess_pretrain_dataset + print_function = print_unsupervised_dataset_example + elif stage == "sft" and not training_args.predict_with_generate: + preprocess_func = preprocess_packed_supervised_dataset if data_args.sft_packing else preprocess_supervised_dataset + print_function = print_supervised_dataset_example + elif stage == "rm": + preprocess_func = preprocess_pairwise_dataset + print_function = print_pairwise_dataset_example + else: + preprocess_func = preprocess_unsupervised_dataset + print_function = print_unsupervised_dataset_example + + if data_args.cache_path is not None and os.path.exists(data_args.cache_path): + logger.warning("Loading dataset from disk will ignore other data arguments.") + return load_from_disk(data_args.cache_path) + + with training_args.main_process_first(desc="dataset map pre-processing"): + column_names = list(next(iter(dataset)).keys()) + kwargs = {} + if not data_args.streaming: + kwargs = dict( + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=(not data_args.overwrite_cache), + desc="Running tokenizer on dataset" + ) + + dataset = dataset.map( + preprocess_func, + batched=True, + remove_columns=column_names, + **kwargs + ) + + if data_args.cache_path is not None and not os.path.exists(data_args.cache_path): + if training_args.should_save: + dataset.save_to_disk(data_args.cache_path) + raise SystemExit("Dataset saved, rerun this script with the same `--cache_path`.") + + if training_args.should_log: + try: + print_function(next(iter(dataset))) + except StopIteration: + raise RuntimeError("Empty dataset!") + + return dataset diff --git a/LLM-Detector-V4-11w/src/llmtuner/data/template.py b/LLM-Detector-V4-11w/src/llmtuner/data/template.py new file mode 100644 index 0000000000000000000000000000000000000000..ebb633c5c0d5565d2f6cda1ed6a07a56acf28e8d --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/data/template.py @@ -0,0 +1,747 @@ +import tiktoken +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + +from llmtuner.extras.logging import get_logger + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + + +logger = get_logger(__name__) + + +@dataclass +class Template: + + prefix: List[Union[str, Dict[str, str]]] + prompt: List[Union[str, 
Dict[str, str]]] + system: str + sep: List[Union[str, Dict[str, str]]] + stop_words: List[str] + use_history: bool + efficient_eos: bool + + def encode_oneturn( + self, + tokenizer: "PreTrainedTokenizer", + query: str, + resp: str, + history: Optional[List[Tuple[str, str]]] = None, + system: Optional[str] = None + ) -> Tuple[List[int], List[int]]: + r""" + Returns a single pair of token ids representing prompt and response respectively. + """ + system, history = self._format(query, resp, history, system) + encoded_pairs = self._encode(tokenizer, system, history) + prompt_ids = [] + for query_ids, resp_ids in encoded_pairs[:-1]: + prompt_ids = prompt_ids + query_ids + resp_ids + prompt_ids, answer_ids = prompt_ids + encoded_pairs[-1][0], encoded_pairs[-1][1] + return prompt_ids, answer_ids + + def encode_multiturn( + self, + tokenizer: "PreTrainedTokenizer", + query: str, + resp: str, + history: Optional[List[Tuple[str, str]]] = None, + system: Optional[str] = None + ) -> List[Tuple[List[int], List[int]]]: + r""" + Returns multiple pairs of token ids representing prompts and responses respectively. + """ + system, history = self._format(query, resp, history, system) + encoded_pairs = self._encode(tokenizer, system, history) + return encoded_pairs + + def _format( + self, + query: str, + resp: str, + history: Optional[List[Tuple[str, str]]] = None, + system: Optional[str] = None + ) -> Tuple[str, List[Tuple[str, str]]]: + r""" + Aligns inputs to the standard format. + """ + system = system or self.system # use system if provided + history = history if (history and self.use_history) else [] + history = history + [(query, resp)] + return system, history + + def _get_special_ids( + self, + tokenizer: "PreTrainedTokenizer" + ) -> Tuple[List[int], List[int]]: + if tokenizer.bos_token_id is not None and getattr(tokenizer, "add_bos_token", True): + bos_ids = [tokenizer.bos_token_id] + else: # baichuan, qwen and gpt2 models have no bos token + bos_ids = [] + + if tokenizer.eos_token_id is None: + raise ValueError("EOS token is required.") + + if self.efficient_eos: # used in baichuan, qwen, chatglm, etc. + eos_ids = [] + else: + eos_ids = [tokenizer.eos_token_id] + + return bos_ids, eos_ids + + def _encode( + self, + tokenizer: "PreTrainedTokenizer", + system: str, + history: List[Tuple[str, str]] + ) -> List[Tuple[List[int], List[int]]]: + r""" + Encodes formatted inputs to pairs of token ids. + Turn 0: bos + prefix + sep + query resp + eos + Turn t: sep + bos + query resp + eos + """ + bos_ids, eos_ids = self._get_special_ids(tokenizer) + sep_ids = self._convert_inputs_to_ids(tokenizer, context=self.sep) + encoded_pairs = [] + for turn_idx, (query, resp) in enumerate(history): + if turn_idx == 0: + prefix_ids = self._convert_inputs_to_ids(tokenizer, context=self.prefix, system=system) + if len(prefix_ids) != 0: # has prefix + prefix_ids = bos_ids + prefix_ids + sep_ids + else: + prefix_ids = bos_ids + else: + prefix_ids = sep_ids + bos_ids + + query_ids = self._convert_inputs_to_ids(tokenizer, context=self.prompt, query=query, idx=str(turn_idx+1)) + resp_ids = self._convert_inputs_to_ids(tokenizer, context=[resp]) + encoded_pairs.append((prefix_ids + query_ids, resp_ids + eos_ids)) + return encoded_pairs + + def _convert_inputs_to_ids( + self, + tokenizer: "PreTrainedTokenizer", + context: List[Union[str, Dict[str, str]]], + system: Optional[str] = None, + query: Optional[str] = None, + idx: Optional[str] = None + ) -> List[int]: + r""" + Converts context to token ids. 
+ """ + if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen) + kwargs = dict(allowed_special="all") + else: + kwargs = dict(add_special_tokens=False) + + token_ids = [] + for elem in context: + if isinstance(elem, str): + elem = elem.replace("{{system}}", system, 1) if system is not None else elem + elem = elem.replace("{{query}}", query, 1) if query is not None else elem + elem = elem.replace("{{idx}}", idx, 1) if idx is not None else elem + if len(elem) != 0: + token_ids = token_ids + tokenizer.encode(elem, **kwargs) + elif isinstance(elem, dict): + token_ids = token_ids + [tokenizer.convert_tokens_to_ids(elem.get("token"))] + else: + raise ValueError("Input must be string or dict[str, str], got {}".format(type(elem))) + + return token_ids + + +@dataclass +class Llama2Template(Template): + + def _encode( + self, + tokenizer: "PreTrainedTokenizer", + system: str, + history: List[Tuple[str, str]] + ) -> List[Tuple[List[int], List[int]]]: + r""" + Encodes formatted inputs to pairs of token ids. + Turn 0: bos + prefix + query resp + eos + Turn t: bos + query resp + eos + """ + bos_ids, eos_ids = self._get_special_ids(tokenizer) + encoded_pairs = [] + for turn_idx, (query, resp) in enumerate(history): + if turn_idx == 0: # llama2 template has no sep_ids + query = self.prefix[0].replace("{{system}}", system) + query + query_ids = self._convert_inputs_to_ids(tokenizer, context=self.prompt, query=query) + resp_ids = self._convert_inputs_to_ids(tokenizer, context=[resp]) + encoded_pairs.append((bos_ids + query_ids, resp_ids + eos_ids)) + return encoded_pairs + + +templates: Dict[str, Template] = {} + + +def register_template( + name: str, + prefix: List[Union[str, Dict[str, str]]], + prompt: List[Union[str, Dict[str, str]]], + system: str, + sep: List[Union[str, Dict[str, str]]], + stop_words: Optional[List[str]] = [], + use_history: Optional[bool] = True, + efficient_eos: Optional[bool] = False +) -> None: + template_class = Llama2Template if "llama2" in name else Template + templates[name] = template_class( + prefix=prefix, + prompt=prompt, + system=system, + sep=sep, + stop_words=stop_words, + use_history=use_history, + efficient_eos=efficient_eos + ) + + +def get_template_and_fix_tokenizer( + name: str, + tokenizer: "PreTrainedTokenizer" +) -> Template: + if tokenizer.eos_token_id is None: + tokenizer.eos_token = "<|endoftext|>" + logger.info("Add eos token: {}".format(tokenizer.eos_token)) + + if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + logger.info("Add pad token: {}".format(tokenizer.pad_token)) + + if name is None: + return None + + template = templates.get(name, None) + assert template is not None, "Template {} does not exist.".format(name) + tokenizer.add_special_tokens( + dict(additional_special_tokens=template.stop_words), + replace_additional_special_tokens=False + ) + return template + + +register_template( + name="alpaca", + prefix=[ + "{{system}}" + ], + prompt=[ + "### Instruction:\n{{query}}\n\n### Response:\n" + ], + system=( + "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request." + ), + sep=[ + "\n\n" + ] +) + + +register_template( + name="aquila", + prefix=[ + "{{system}}" + ], + prompt=[ + "Human: {{query}}###Assistant:" + ], + system=( + "A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions." 
+ ), + sep=[ + "###" + ], + stop_words=[ + "" + ], + efficient_eos=True +) + + +register_template( + name="baichuan", + prefix=[ + "{{system}}" + ], + prompt=[ + {"token": ""}, # user token + "{{query}}", + {"token": ""} # assistant token + ], + system="", + sep=[], + efficient_eos=True +) + + +register_template( + name="baichuan2", + prefix=[ + "{{system}}" + ], + prompt=[ + {"token": ""}, # user token + "{{query}}", + {"token": ""} # assistant token + ], + system="", + sep=[], + efficient_eos=True +) + + +register_template( + name="belle", + prefix=[ + "{{system}}" + ], + prompt=[ + "Human: {{query}}\n\nBelle: " + ], + system="", + sep=[ + "\n\n" + ] +) + + +register_template( + name="bluelm", + prefix=[ + "{{system}}" + ], + prompt=[ + {"token": "[|Human|]:"}, + "{{query}}", + {"token": "[|AI|]:"} + ], + system="", + sep=[] +) + + +register_template( + name="chatglm2", + prefix=[ + {"token": "[gMASK]"}, + {"token": "sop"}, + "{{system}}" + ], + prompt=[ + "[Round {{idx}}]\n\n问:{{query}}\n\n答:" + ], + system="", + sep=[ + "\n\n" + ], + efficient_eos=True +) + + +register_template( + name="chatglm3", + prefix=[ + {"token": "[gMASK]"}, + {"token": "sop"}, + {"token": "<|system|>"}, + "\n", + "{{system}}" + ], + prompt=[ + {"token": "<|user|>"}, + "\n", + "{{query}}", + {"token": "<|assistant|>"}, + "\n" # add an extra newline to avoid error in ChatGLM's process_response method + ], + system=( + "You are ChatGLM3, a large language model trained by Zhipu.AI. " + "Follow the user's instructions carefully. Respond using markdown." + ), + sep=[], + stop_words=[ + "<|user|>", + "<|observation|>" + ], + efficient_eos=True +) + + +register_template( + name="chatglm3_raw", # the raw template for tool tuning + prefix=[ + {"token": "[gMASK]"}, + {"token": "sop"}, + {"token": "<|system|>"}, + "\n", + "{{system}}" + ], + prompt=[ + {"token": "<|user|>"}, + "\n", + "{{query}}", + {"token": "<|assistant|>"} + ], + system=( + "You are ChatGLM3, a large language model trained by Zhipu.AI. " + "Follow the user's instructions carefully. Respond using markdown." + ), + sep=[], + stop_words=[ + "<|user|>", + "<|observation|>" + ], + efficient_eos=True +) + + +register_template( + name="deepseek", + prefix=[ + "{{system}}" + ], + prompt=[ + "User: {{query}}\n\nAssistant:" + ], + system="", + sep=[] +) + + +register_template( + name="deepseekcoder", + prefix=[ + "{{system}}" + ], + prompt=[ + "### Instruction:\n{{query}}\n### Response:\n" + ], + system=( + "You are an AI programming assistant, utilizing the Deepseek Coder model, " + "developed by Deepseek Company, and you only answer questions related to computer science. " + "For politically sensitive questions, security and privacy issues, " + "and other non-computer science questions, you will refuse to answer\n" + ), + sep=[ + "\n", + {"token": "<|EOT|>"}, + "\n" + ], + stop_words=[ + "<|EOT|>" + ], + efficient_eos=True +) + + +register_template( + name="default", + prefix=[ + "{{system}}" + ], + prompt=[ + "Human: {{query}}\nAssistant:" + ], + system=( + "A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions." 
+ ), + sep=[ + "\n" + ] +) + + +register_template( + name="falcon", + prefix=[ + "{{system}}" + ], + prompt=[ + "User: {{query}}\nFalcon:" + ], + system="", + sep=[ + "\n" + ], + efficient_eos=True +) + + +register_template( + name="intern", + prefix=[ + "{{system}}" + ], + prompt=[ + "<|User|>:{{query}}", + {"token": ""}, + "\n<|Bot|>:" + ], + system="", + sep=[ + {"token": ""}, + "\n" + ], + stop_words=[ + "" + ], + efficient_eos=True +) + + +register_template( + name="llama2", + prefix=[ + "<>\n{{system}}\n<>\n\n" + ], + prompt=[ + "[INST] {{query}} [/INST]" + ], + system=( + "You are a helpful, respectful and honest assistant. " + "Always answer as helpfully as possible, while being safe. " + "Your answers should not include any harmful, unethical, " + "racist, sexist, toxic, dangerous, or illegal content. " + "Please ensure that your responses are socially unbiased and positive in nature.\n\n" + "If a question does not make any sense, or is not factually coherent, " + "explain why instead of answering something not correct. " + "If you don't know the answer to a question, please don't share false information." + ), + sep=[] +) + + +register_template( + name="llama2_zh", + prefix=[ + "<>\n{{system}}\n<>\n\n" + ], + prompt=[ + "[INST] {{query}} [/INST]" + ], + system="You are a helpful assistant. 你是一个乐于助人的助手。", + sep=[] +) + + +register_template( + name="mistral", + prefix=[ + "{{system}}" + ], + prompt=[ + "[INST] {{query}} [/INST]" + ], + system="", + sep=[ + " " + ] +) + + +register_template( + name="openchat", + prefix=[ + "{{system}}" + ], + prompt=[ + "GPT4 Correct User: {{query}}", + {"token": "<|end_of_turn|>"}, + "GPT4 Correct Assistant:" + ], + system="", + sep=[ + {"token": "<|end_of_turn|>"} + ], + stop_words=[ + "<|end_of_turn|>" + ], + efficient_eos=True +) + + +register_template( + name="qwen", + prefix=[ + {"token": "<|im_start|>"}, + "system\n{{system}}" + ], + prompt=[ + {"token": "<|im_start|>"}, + "user\n{{query}}", + {"token": "<|im_end|>"}, + "\n", + {"token": "<|im_start|>"}, + "assistant\n" + ], + system="You are a helpful assistant.", + sep=[ + {"token": "<|im_end|>"}, + "\n" + ], + stop_words=[ + "<|im_end|>" + ], + efficient_eos=True +) + + +register_template( + name="starchat", + prefix=[ + {"token": "<|system|>"}, + "\n{{system}}", + ], + prompt=[ + {"token": "<|user|>"}, + "\n{{query}}", + {"token": "<|end|>"}, + "\n", + {"token": "<|assistant|>"} + ], + system="", + sep=[ + {"token": "<|end|>"}, + "\n" + ], + stop_words=[ + "<|end|>" + ], + efficient_eos=True +) + + +r""" +Supports language model inference without histories. +""" +register_template( + name="vanilla", + prefix=[], + prompt=[ + "{{query}}" + ], + system="", + sep=[], + use_history=False +) + + +register_template( + name="vicuna", + prefix=[ + "{{system}}" + ], + prompt=[ + "USER: {{query}} ASSISTANT:" + ], + system=( + "A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions." + ), + sep=[] +) + + +register_template( + name="xverse", + prefix=[ + "{{system}}" + ], + prompt=[ + "Human: {{query}}\n\nAssistant: " + ], + system="", + sep=[] +) + + +register_template( + name="yayi", + prefix=[ + {"token": "<|System|>"}, + ":\n{{system}}" + ], + prompt=[ + {"token": "<|Human|>"}, + ":\n{{query}}\n\n", + {"token": "<|YaYi|>"}, + ":" + ], + system=( + "You are a helpful, respectful and honest assistant named YaYi " + "developed by Beijing Wenge Technology Co.,Ltd. 
" + "Always answer as helpfully as possible, while being safe. " + "Your answers should not include any harmful, unethical, " + "racist, sexist, toxic, dangerous, or illegal content. " + "Please ensure that your responses are socially unbiased and positive in nature.\n\n" + "If a question does not make any sense, or is not factually coherent, " + "explain why instead of answering something not correct. " + "If you don't know the answer to a question, please don't share false information." + ), + sep=[ + "\n\n" + ], + stop_words=[ + "<|End|>" + ] +) + + +register_template( + name="yi", + prefix=[ + "{{system}}" + ], + prompt=[ + "<|im_start|>user\n{{query}}<|im_end|>\n<|im_start|>assistant\n" + ], + system="", + sep=[ + "<|im_end|>\n" + ], + efficient_eos=True +) + + +register_template( + name="zephyr", + prefix=[ + {"token": "<|system|>"}, + "\n{{system}}", + {"token": ""} + ], + prompt=[ + {"token": "<|user|>"}, + "\n{{query}}", + {"token": ""}, + {"token": "<|assistant|>"} + ], + system="You are a friendly chatbot who always responds in the style of a pirate", + sep=[] +) + + +register_template( + name="ziya", + prefix=[ + "{{system}}" + ], + prompt=[ + {"token": ""}, + ":{{query}}\n", + {"token": ""}, + ":" + ], + system="", + sep=[ + "\n" + ] +) diff --git a/LLM-Detector-V4-11w/src/llmtuner/data/utils.py b/LLM-Detector-V4-11w/src/llmtuner/data/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fecadce4f52bfe89fd4f7b9c8689a5c308d3c410 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/data/utils.py @@ -0,0 +1,61 @@ +import hashlib +from typing import TYPE_CHECKING, Dict, List, Optional, Union + +from llmtuner.extras.logging import get_logger + +if TYPE_CHECKING: + from datasets import Dataset, IterableDataset + from transformers import TrainingArguments + from llmtuner.hparams import DataArguments + + +logger = get_logger(__name__) + + +EXT2TYPE = { + "arrow": "arrow", + "csv": "csv", + "json": "json", + "jsonl": "json", + "parquet": "parquet", + "txt": "text" +} + + +def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: + if file_sha1 is None: + logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.") + return + + if len(data_files) != 1: + logger.warning("Checksum failed: too many files.") + return + + with open(data_files[0], "rb") as f: + sha1 = hashlib.sha1(f.read()).hexdigest() + if sha1 != file_sha1: + logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0])) + + +def split_dataset( + dataset: Union["Dataset", "IterableDataset"], + data_args: "DataArguments", + training_args: "TrainingArguments" +) -> Dict[str, "Dataset"]: + if training_args.do_train: + if data_args.val_size > 1e-6: # Split the dataset + if data_args.streaming: + val_set = dataset.take(int(data_args.val_size)) + train_set = dataset.skip(int(data_args.val_size)) + dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) + return {"train_dataset": train_set, "eval_dataset": val_set} + else: + val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size + dataset = dataset.train_test_split(test_size=val_size, seed=training_args.seed) + return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]} + else: + if data_args.streaming: + dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) + return {"train_dataset": dataset} + else: # do_eval or do_predict + return {"eval_dataset": dataset} diff --git 
a/LLM-Detector-V4-11w/src/llmtuner/eval/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a7c9a12751a55f5cfab406c0840b6f5221d8fac9 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/eval/__init__.py @@ -0,0 +1 @@ +from llmtuner.eval.evaluator import Evaluator diff --git a/LLM-Detector-V4-11w/src/llmtuner/eval/evaluator.py b/LLM-Detector-V4-11w/src/llmtuner/eval/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..1fbd40eecda35e060c755e9aaa11b9beefeb0e79 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/eval/evaluator.py @@ -0,0 +1,124 @@ +# Inspired by: https://github.com/hendrycks/test/blob/master/evaluate_flan.py + +import os +import json +import torch +import inspect +import tiktoken +import numpy as np +from tqdm import tqdm, trange +from typing import Any, Dict, List, Optional + +from datasets import load_dataset +from transformers.utils import cached_file + +from llmtuner.data.template import get_template_and_fix_tokenizer +from llmtuner.eval.template import get_eval_template +from llmtuner.extras.constants import CHOICES, SUBJECTS +from llmtuner.model import dispatch_model, get_eval_args, load_model_and_tokenizer + + +class Evaluator: + + def __init__(self, args: Optional[Dict[str, Any]] = None) -> None: + self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args) + self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args) + self.tokenizer.padding_side = "right" # avoid overflow issue in batched inference for llama2 + self.model = dispatch_model(self.model) + self.template = get_template_and_fix_tokenizer(self.data_args.template, self.tokenizer) + self.eval_template = get_eval_template(self.eval_args.lang) + self.choice_inputs = self._encode_choices() + + def _encode_choices(self) -> List[int]: + if isinstance(getattr(self.tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen) + kwargs = dict(allowed_special="all") + else: + kwargs = dict(add_special_tokens=False) + + return [self.tokenizer.encode(self.eval_template.prefix + ch, **kwargs)[-1] for ch in CHOICES] + + @torch.inference_mode() + def batch_inference(self, batch_input: Dict[str, torch.Tensor]) -> List[str]: + logits = self.model(**batch_input).logits + lengths = torch.sum(batch_input["attention_mask"], dim=-1) + word_probs = torch.stack([logits[i, lengths[i] - 1] for i in range(len(lengths))], dim=0) + choice_probs = torch.nn.functional.softmax(word_probs[:, self.choice_inputs], dim=-1).detach() + return [chr(ord("A") + offset.item()) for offset in torch.argmax(choice_probs, dim=-1)] + + def eval(self) -> None: + if "token" in inspect.signature(cached_file).parameters: + kwargs = {"token": self.model_args.hf_hub_token} + elif "use_auth_token" in inspect.signature(cached_file).parameters: # for transformers==4.31.0 + kwargs = {"use_auth_token": self.model_args.hf_hub_token} + + mapping = cached_file( + path_or_repo_id = os.path.join(self.eval_args.task_dir, self.eval_args.task), + filename="mapping.json", + cache_dir=self.model_args.cache_dir, + **kwargs + ) + + with open(mapping, "r", encoding="utf-8") as f: + categorys: Dict[str, Dict[str, str]] = json.load(f) + + category_corrects = {subj: np.array([], dtype="bool") for subj in SUBJECTS} + pbar = tqdm(categorys.keys(), desc="Processing subjects", position=0) + results = {} + for subject in pbar: + dataset = load_dataset( + path=os.path.join(self.eval_args.task_dir, 
self.eval_args.task), + name=subject, + cache_dir=self.model_args.cache_dir, + download_mode=self.eval_args.download_mode, + token=self.model_args.hf_hub_token + ) + pbar.set_postfix_str(categorys[subject]["name"]) + inputs, outputs, labels = [], [], [] + for i in trange(len(dataset[self.data_args.split]), desc="Formatting batches", position=1, leave=False): + support_set = dataset["train"].shuffle().select(range(min(self.eval_args.n_shot, len(dataset["train"])))) + query, resp, history = self.eval_template.format_example( + target_data=dataset[self.data_args.split][i], + support_set=support_set, + subject_name=categorys[subject]["name"], + use_history=self.template.use_history + ) + input_ids, _ = self.template.encode_oneturn( + tokenizer=self.tokenizer, query=query, resp=resp, history=history + ) + inputs.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)}) + labels.append(resp) + + for i in trange(0, len(inputs), self.eval_args.batch_size, desc="Predicting batches", position=1, leave=False): + batch_input = self.tokenizer.pad( + inputs[i : i + self.eval_args.batch_size], return_attention_mask=True, return_tensors="pt" + ).to(self.model.device) + preds = self.batch_inference(batch_input) + outputs += preds + + corrects = (np.array(outputs) == np.array(labels)) + category_name = categorys[subject]["category"] + category_corrects[category_name] = np.concatenate([category_corrects[category_name], corrects], axis=0) + category_corrects["Average"] = np.concatenate([category_corrects["Average"], corrects], axis=0) + results[subject] = {str(i): outputs[i] for i in range(len(outputs))} + + pbar.close() + self._save_results(category_corrects, results) + + def _save_results(self, category_corrects: Dict[str, np.ndarray], results: Dict[str, Dict[int, str]]) -> None: + score_info = "\n".join([ + "{:>15}: {:.2f}".format(category_name, 100 * np.mean(category_correct)) + for category_name, category_correct in category_corrects.items() if len(category_correct) + ]) + print(score_info) + if self.eval_args.save_dir is not None: + os.makedirs(self.eval_args.save_dir, exist_ok=False) + with open(os.path.join(self.eval_args.save_dir, "results.json"), "w", encoding="utf-8", newline="\n") as f: + json.dump(results, f, indent=2) + + with open(os.path.join(self.eval_args.save_dir, "results.log"), "w", encoding="utf-8", newline="\n") as f: + f.write(score_info) + + +if __name__ == "__main__": + evaluator = Evaluator() + evaluator.eval() diff --git a/LLM-Detector-V4-11w/src/llmtuner/eval/template.py b/LLM-Detector-V4-11w/src/llmtuner/eval/template.py new file mode 100644 index 0000000000000000000000000000000000000000..2251ad5785095b387a5029c2e2763a51f8a7f0f2 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/eval/template.py @@ -0,0 +1,86 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, List, Tuple + +from llmtuner.extras.constants import CHOICES + +if TYPE_CHECKING: + from datasets import Dataset + + +@dataclass +class EvalTemplate: + + system: str + choice: str + answer: str + prefix: str + + def parse_example( + self, + example: Dict[str, str] + ) -> Tuple[str, str]: + candidates = [self.choice.format(choice=ch, content=example[ch]) for ch in CHOICES if ch in example] + return "".join([example["question"]] + candidates + [self.answer]), example["answer"] + + def format_example( + self, + target_data: Dict[str, str], + support_set: "Dataset", + subject_name: str, + use_history: bool + ) -> Tuple[str, str, List[Tuple[str, str]]]: + query, resp = 
self.parse_example(target_data) + history = [self.parse_example(support_set[k]) for k in range(len(support_set))] + + if len(history): + temp = history.pop(0) + history.insert(0, (self.system.format(subject=subject_name) + temp[0], temp[1])) + else: + query = self.system.format(subject=subject_name) + query + + if not use_history: + query = "\n\n".join(["".join(item) for item in history] + [query]) + history = [] + return query.strip(), resp, history + + +eval_templates: Dict[str, EvalTemplate] = {} + + +def register_eval_template( + name: str, + system: str, + choice: str, + answer: str, + prefix: str +) -> None: + eval_templates[name] = EvalTemplate( + system=system, + choice=choice, + answer=answer, + prefix=prefix + ) + + +def get_eval_template(name: str) -> EvalTemplate: + eval_template = eval_templates.get(name, None) + assert eval_template is not None, "Template {} does not exist.".format(name) + return eval_template + + +register_eval_template( + name="en", + system="The following are multiple choice questions (with answers) about {subject}.\n\n", + choice="\n{choice}. {content}", + answer="\nAnswer: ", + prefix=" " +) + + +register_eval_template( + name="zh", + system="以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n", + choice="\n{choice}. {content}", + answer="\n答案:", + prefix="\n" +) diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/extras/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/callbacks.py b/LLM-Detector-V4-11w/src/llmtuner/extras/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..fd78391dd090fed92e1388efeee25cb2236466cd --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/extras/callbacks.py @@ -0,0 +1,165 @@ +import os +import json +import time +from typing import TYPE_CHECKING +from datetime import timedelta + +from transformers import TrainerCallback +from transformers.modeling_utils import custom_object_save, unwrap_model +from transformers.trainer_utils import has_length, PREFIX_CHECKPOINT_DIR + +from llmtuner.extras.constants import LOG_FILE_NAME +from llmtuner.extras.logging import get_logger + +if TYPE_CHECKING: + from transformers import TrainingArguments, TrainerState, TrainerControl + from trl import AutoModelForCausalLMWithValueHead + + +logger = get_logger(__name__) + + +def _save_model_with_valuehead(model: "AutoModelForCausalLMWithValueHead", output_dir: str) -> None: + model.pretrained_model.config.save_pretrained(output_dir) + if model.pretrained_model.can_generate(): + model.pretrained_model.generation_config.save_pretrained(output_dir) + if getattr(model, "is_peft_model", False): + model.pretrained_model.save_pretrained(output_dir) + elif getattr(model.pretrained_model, "_auto_class", None): # must not a peft model + custom_object_save(model.pretrained_model, output_dir, config=model.pretrained_model.config) + + +class SavePeftModelCallback(TrainerCallback): + + def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called after a checkpoint save. 
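+ Saves the value-head model's config (and, for PEFT models, the adapter weights) into the checkpoint folder via _save_model_with_valuehead.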
+ """ + if args.should_save: + _save_model_with_valuehead( + model=unwrap_model(kwargs.pop("model")), + output_dir=os.path.join(args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step)) + ) + + def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of training. + """ + if args.should_save: + _save_model_with_valuehead(model=unwrap_model(kwargs.pop("model")), output_dir=args.output_dir) + + +class LogCallback(TrainerCallback): + + def __init__(self, runner=None): + self.runner = runner + self.in_training = False + self.start_time = time.time() + self.cur_steps = 0 + self.max_steps = 0 + self.elapsed_time = "" + self.remaining_time = "" + + def timing(self): + cur_time = time.time() + elapsed_time = cur_time - self.start_time + avg_time_per_step = elapsed_time / self.cur_steps if self.cur_steps != 0 else 0 + remaining_time = (self.max_steps - self.cur_steps) * avg_time_per_step + self.elapsed_time = str(timedelta(seconds=int(elapsed_time))) + self.remaining_time = str(timedelta(seconds=int(remaining_time))) + + def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the beginning of training. + """ + if state.is_local_process_zero: + self.in_training = True + self.start_time = time.time() + self.max_steps = state.max_steps + if os.path.exists(os.path.join(args.output_dir, LOG_FILE_NAME)) and args.overwrite_output_dir: + logger.warning("Previous log file in this folder will be deleted.") + os.remove(os.path.join(args.output_dir, LOG_FILE_NAME)) + + def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of training. + """ + if state.is_local_process_zero: + self.in_training = False + self.cur_steps = 0 + self.max_steps = 0 + + def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of an substep during gradient accumulation. + """ + if state.is_local_process_zero and self.runner is not None and self.runner.aborted: + control.should_epoch_stop = True + control.should_training_stop = True + + def on_step_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of a training step. + """ + if state.is_local_process_zero: + self.cur_steps = state.global_step + self.timing() + if self.runner is not None and self.runner.aborted: + control.should_epoch_stop = True + control.should_training_stop = True + + def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called after an evaluation phase. + """ + if state.is_local_process_zero and not self.in_training: + self.cur_steps = 0 + self.max_steps = 0 + + def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", *other, **kwargs): + r""" + Event called after a successful prediction. + """ + if state.is_local_process_zero and not self.in_training: + self.cur_steps = 0 + self.max_steps = 0 + + def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None: + r""" + Event called after logging the last logs. 
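+ Appends the latest metrics as one JSON line to trainer_log.jsonl under output_dir (the file named by LOG_FILE_NAME).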
+ """ + if not state.is_local_process_zero: + return + + logs = dict( + current_steps=self.cur_steps, + total_steps=self.max_steps, + loss=state.log_history[-1].get("loss", None), + eval_loss=state.log_history[-1].get("eval_loss", None), + predict_loss=state.log_history[-1].get("predict_loss", None), + reward=state.log_history[-1].get("reward", None), + learning_rate=state.log_history[-1].get("learning_rate", None), + epoch=state.log_history[-1].get("epoch", None), + percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100, + elapsed_time=self.elapsed_time, + remaining_time=self.remaining_time + ) + if self.runner is not None: + logger.info("{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}}}".format( + logs["loss"] or 0, logs["learning_rate"] or 0, logs["epoch"] or 0 + )) + + os.makedirs(args.output_dir, exist_ok=True) + with open(os.path.join(args.output_dir, "trainer_log.jsonl"), "a", encoding="utf-8") as f: + f.write(json.dumps(logs) + "\n") + + def on_prediction_step(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called after a prediction step. + """ + eval_dataloader = kwargs.pop("eval_dataloader", None) + if state.is_local_process_zero and has_length(eval_dataloader) and not self.in_training: + if self.max_steps == 0: + self.max_steps = len(eval_dataloader) + self.cur_steps += 1 + self.timing() diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/constants.py b/LLM-Detector-V4-11w/src/llmtuner/extras/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..a36102a1b10bc805c3a5f5c33657cac780d625d2 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/extras/constants.py @@ -0,0 +1,587 @@ +from enum import Enum +from collections import defaultdict, OrderedDict +from typing import Dict, Optional + + +CHOICES = ["A", "B", "C", "D"] + +DEFAULT_MODULE = defaultdict(str) + +DEFAULT_TEMPLATE = defaultdict(str) + +IGNORE_INDEX = -100 + +LAYERNORM_NAMES = {"norm", "ln"} + +LOG_FILE_NAME = "trainer_log.jsonl" + +METHODS = ["full", "freeze", "lora"] + +SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"] + +SUPPORTED_MODELS = OrderedDict() + +TRAINING_STAGES = { + "Supervised Fine-Tuning": "sft", + "Reward Modeling": "rm", + "PPO": "ppo", + "DPO": "dpo", + "Pre-Training": "pt" +} + +class DownloadSource(str, Enum): + DEFAULT = "hf" + MODELSCOPE = "ms" + + +def register_model_group( + models: Dict[str, Dict[DownloadSource, str]], + module: Optional[str] = None, + template: Optional[str] = None +) -> None: + prefix = None + for name, path in models.items(): + if prefix is None: + prefix = name.split("-")[0] + else: + assert prefix == name.split("-")[0], "prefix should be identical." 
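+ # every name in a group shares one prefix (e.g. "Baichuan-7B-Base" and "Baichuan-13B-Chat" -> "Baichuan"), which keys DEFAULT_MODULE and DEFAULT_TEMPLATE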
+ SUPPORTED_MODELS[name] = path + if module is not None: + DEFAULT_MODULE[prefix] = module + if template is not None: + DEFAULT_TEMPLATE[prefix] = template + + +register_model_group( + models={ + "Baichuan-7B-Base": { + DownloadSource.DEFAULT: "baichuan-inc/Baichuan-7B", + DownloadSource.MODELSCOPE: "baichuan-inc/baichuan-7B" + }, + "Baichuan-13B-Base": { + DownloadSource.DEFAULT: "baichuan-inc/Baichuan-13B-Base", + DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Base" + }, + "Baichuan-13B-Chat": { + DownloadSource.DEFAULT: "baichuan-inc/Baichuan-13B-Chat", + DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Chat" + } + }, + module="W_pack", + template="baichuan" +) + + +register_model_group( + models={ + "Baichuan2-7B-Base": { + DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-7B-Base", + DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-7B-Base" + }, + "Baichuan2-13B-Base": { + DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-13B-Base", + DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Base" + }, + "Baichuan2-7B-Chat": { + DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-7B-Chat", + DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-7B-Chat" + }, + "Baichuan2-13B-Chat": { + DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-13B-Chat", + DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Chat" + } + }, + module="W_pack", + template="baichuan2" +) + + +register_model_group( + models={ + "BLOOM-560M": { + DownloadSource.DEFAULT: "bigscience/bloom-560m", + DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-560m" + }, + "BLOOM-3B": { + DownloadSource.DEFAULT: "bigscience/bloom-3b", + DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-3b" + }, + "BLOOM-7B1": { + DownloadSource.DEFAULT: "bigscience/bloom-7b1", + DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-7b1" + } + }, + module="query_key_value" +) + + +register_model_group( + models={ + "BLOOMZ-560M": { + DownloadSource.DEFAULT: "bigscience/bloomz-560m", + DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-560m" + }, + "BLOOMZ-3B": { + DownloadSource.DEFAULT: "bigscience/bloomz-3b", + DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-3b" + }, + "BLOOMZ-7B1-mt": { + DownloadSource.DEFAULT: "bigscience/bloomz-7b1-mt", + DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-7b1-mt" + } + }, + module="query_key_value" +) + + +register_model_group( + models={ + "BlueLM-7B-Base": { + DownloadSource.DEFAULT: "vivo-ai/BlueLM-7B-Base", + DownloadSource.MODELSCOPE: "vivo-ai/BlueLM-7B-Base" + }, + "BlueLM-7B-Chat": { + DownloadSource.DEFAULT: "vivo-ai/BlueLM-7B-Chat", + DownloadSource.MODELSCOPE: "vivo-ai/BlueLM-7B-Chat" + } + }, + template="bluelm" +) + + +register_model_group( + models={ + "ChatGLM2-6B-Chat": { + DownloadSource.DEFAULT: "THUDM/chatglm2-6b", + DownloadSource.MODELSCOPE: "ZhipuAI/chatglm2-6b" + } + }, + module="query_key_value", + template="chatglm2" +) + + +register_model_group( + models={ + "ChatGLM3-6B-Base": { + DownloadSource.DEFAULT: "THUDM/chatglm3-6b-base", + DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b-base" + }, + "ChatGLM3-6B-Chat": { + DownloadSource.DEFAULT: "THUDM/chatglm3-6b", + DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b" + } + }, + module="query_key_value", + template="chatglm3" +) + + +register_model_group( + models={ + "ChineseLLaMA2-1.3B": { + DownloadSource.DEFAULT: "hfl/chinese-llama-2-1.3b", + DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-1.3b" + }, + "ChineseLLaMA2-7B": { + DownloadSource.DEFAULT: "hfl/chinese-llama-2-7b", + DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-7b" + 
}, + "ChineseLLaMA2-13B": { + DownloadSource.DEFAULT: "hfl/chinese-llama-2-13b", + DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-13b" + }, + "ChineseLLaMA2-1.3B-Chat": { + DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-1.3b", + DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-1.3b" + }, + "ChineseLLaMA2-7B-Chat": { + DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-7b", + DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-7b" + }, + "ChineseLLaMA2-13B-Chat": { + DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-13b", + DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-13b" + } + }, + template="llama2_zh" +) + + +register_model_group( + models={ + "DeepseekLLM-7B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-base", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-base" + }, + "DeepseekLLM-67B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-base", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-base" + }, + "DeepseekLLM-7B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-chat", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-chat" + }, + "DeepseekLLM-67B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-chat", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-chat" + } + }, + template="deepseek" +) + + +register_model_group( + models={ + "DeepseekCoder-6.7B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-base", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-base" + }, + "DeepseekCoder-33B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-base", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-base" + }, + "DeepseekCoder-6.7B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-instruct", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-instruct" + }, + "DeepseekCoder-33B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-instruct", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-instruct" + } + }, + template="deepseekcoder" +) + + +register_model_group( + models={ + "Falcon-7B": { + DownloadSource.DEFAULT: "tiiuae/falcon-7b", + DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b" + }, + "Falcon-40B": { + DownloadSource.DEFAULT: "tiiuae/falcon-40b", + DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b" + }, + "Falcon-180B": { + DownloadSource.DEFAULT: "tiiuae/falcon-180b", + DownloadSource.MODELSCOPE: "modelscope/falcon-180B" + }, + "Falcon-7B-Chat": { + DownloadSource.DEFAULT: "tiiuae/falcon-7b-instruct", + DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b-instruct" + }, + "Falcon-40B-Chat": { + DownloadSource.DEFAULT: "tiiuae/falcon-40b-instruct", + DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b-instruct" + }, + "Falcon-180B-Chat": { + DownloadSource.DEFAULT: "tiiuae/falcon-180b-chat", + DownloadSource.MODELSCOPE: "modelscope/falcon-180B-chat" + } + }, + module="query_key_value", + template="falcon" +) + + +register_model_group( + models={ + "InternLM-7B": { + DownloadSource.DEFAULT: "internlm/internlm-7b", + DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-7b" + }, + "InternLM-20B": { + DownloadSource.DEFAULT: "internlm/internlm-20b", + DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-20b" + }, + "InternLM-7B-Chat": { + DownloadSource.DEFAULT: "internlm/internlm-chat-7b", + DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-chat-7b" + }, + "InternLM-20B-Chat": { + DownloadSource.DEFAULT: 
"internlm/internlm-chat-20b", + DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-chat-20b" + } + }, + template="intern" +) + + +register_model_group( + models={ + "LingoWhale-8B": { + DownloadSource.DEFAULT: "deeplang-ai/LingoWhale-8B", + DownloadSource.MODELSCOPE: "DeepLang/LingoWhale-8B" + } + }, + module="qkv_proj" +) + + +register_model_group( + models={ + "LLaMA-7B": { + DownloadSource.DEFAULT: "huggyllama/llama-7b", + DownloadSource.MODELSCOPE: "skyline2006/llama-7b" + }, + "LLaMA-13B": { + DownloadSource.DEFAULT: "huggyllama/llama-13b", + DownloadSource.MODELSCOPE: "skyline2006/llama-13b" + }, + "LLaMA-30B": { + DownloadSource.DEFAULT: "huggyllama/llama-30b", + DownloadSource.MODELSCOPE: "skyline2006/llama-30b" + }, + "LLaMA-65B": { + DownloadSource.DEFAULT: "huggyllama/llama-65b", + DownloadSource.MODELSCOPE: "skyline2006/llama-65b" + } + } +) + + +register_model_group( + models={ + "LLaMA2-7B": { + DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-hf", + DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-ms" + }, + "LLaMA2-13B": { + DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-hf", + DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-ms" + }, + "LLaMA2-70B": { + DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-hf", + DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-ms" + }, + "LLaMA2-7B-Chat": { + DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-chat-hf", + DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-chat-ms" + }, + "LLaMA2-13B-Chat": { + DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-chat-hf", + DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-chat-ms" + }, + "LLaMA2-70B-Chat": { + DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-chat-hf", + DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-chat-ms" + } + }, + template="llama2" +) + + +register_model_group( + models={ + "Mistral-7B": { + DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.1", + DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.1" + }, + "Mistral-7B-Chat": { + DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.1", + DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.1" + } + }, + template="mistral" +) + + +register_model_group( + models={ + "OpenChat3.5-7B-Chat": { + DownloadSource.DEFAULT: "openchat/openchat_3.5", + DownloadSource.MODELSCOPE: "myxiongmodel/openchat_3.5" + } + }, + template="openchat" +) + + +register_model_group( + models={ + "Phi1.5-1.3B": { + DownloadSource.DEFAULT: "microsoft/phi-1_5", + DownloadSource.MODELSCOPE: "allspace/PHI_1-5" + } + }, + module="Wqkv" +) + + +register_model_group( + models={ + "Qwen-1.8B": { + DownloadSource.DEFAULT: "Qwen/Qwen-1_8B", + DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B" + }, + "Qwen-7B": { + DownloadSource.DEFAULT: "Qwen/Qwen-7B", + DownloadSource.MODELSCOPE: "qwen/Qwen-7B" + }, + "Qwen-14B": { + DownloadSource.DEFAULT: "Qwen/Qwen-14B", + DownloadSource.MODELSCOPE: "qwen/Qwen-14B" + }, + "Qwen-72B": { + DownloadSource.DEFAULT: "Qwen/Qwen-72B", + DownloadSource.MODELSCOPE: "qwen/Qwen-72B" + }, + "Qwen-1.8B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat" + }, + "Qwen-7B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat" + }, + "Qwen-14B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat" + }, + "Qwen-72B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat" + }, + "Qwen-1.8B-int8-Chat": { + 
DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat-Int8", + DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat-Int8" + }, + "Qwen-1.8B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat-Int4", + DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat-Int4" + }, + "Qwen-7B-int8-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat-Int8", + DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat-Int8" + }, + "Qwen-7B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat-Int4", + DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat-Int4" + }, + "Qwen-14B-int8-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat-Int8", + DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat-Int8" + }, + "Qwen-14B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat-Int4", + DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat-Int4" + }, + "Qwen-72B-int8-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat-Int8", + DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat-Int8" + }, + "Qwen-72B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat-Int4", + DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat-Int4" + } + }, + module="c_attn", + template="qwen" +) + + +register_model_group( + models={ + "Skywork-13B-Base": { + DownloadSource.DEFAULT: "Skywork/Skywork-13B-base", + DownloadSource.MODELSCOPE: "skywork/Skywork-13B-base" + } + } +) + + +register_model_group( + models={ + "Vicuna1.5-7B-Chat": { + DownloadSource.DEFAULT: "lmsys/vicuna-7b-v1.5", + DownloadSource.MODELSCOPE: "Xorbits/vicuna-7b-v1.5" + }, + "Vicuna1.5-13B-Chat": { + DownloadSource.DEFAULT: "lmsys/vicuna-13b-v1.5", + DownloadSource.MODELSCOPE: "Xorbits/vicuna-13b-v1.5" + } + }, + template="vicuna" +) + + +register_model_group( + models={ + "XVERSE-7B": { + DownloadSource.DEFAULT: "xverse/XVERSE-7B", + DownloadSource.MODELSCOPE: "xverse/XVERSE-7B" + }, + "XVERSE-13B": { + DownloadSource.DEFAULT: "xverse/XVERSE-13B", + DownloadSource.MODELSCOPE: "xverse/XVERSE-13B" + }, + "XVERSE-65B": { + DownloadSource.DEFAULT: "xverse/XVERSE-65B", + DownloadSource.MODELSCOPE: "xverse/XVERSE-65B" + }, + "XVERSE-7B-Chat": { + DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat", + DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat" + }, + "XVERSE-13B-Chat": { + DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat", + DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat" + } + }, + template="xverse" +) + + +register_model_group( + models={ + "Yayi-7B": { + DownloadSource.DEFAULT: "wenge-research/yayi-7b-llama2", + DownloadSource.MODELSCOPE: "AI-ModelScope/yayi-7b-llama2" + }, + "Yayi-13B": { + DownloadSource.DEFAULT: "wenge-research/yayi-13b-llama2", + DownloadSource.MODELSCOPE: "AI-ModelScope/yayi-13b-llama2" + } + }, + template="yayi" +) + + +register_model_group( + models={ + "Yi-6B": { + DownloadSource.DEFAULT: "01-ai/Yi-6B", + DownloadSource.MODELSCOPE: "01ai/Yi-6B" + }, + "Yi-34B": { + DownloadSource.DEFAULT: "01-ai/Yi-34B", + DownloadSource.MODELSCOPE: "01ai/Yi-34B" + }, + "Yi-34B-Chat": { + DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat", + DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat" + }, + "Yi-34B-int8-Chat": { + DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-8bits", + DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-8bits" + } + }, + template="yi" +) + + +register_model_group( + models={ + "Zephyr-7B-Alpha-Chat": { + DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-alpha", + DownloadSource.MODELSCOPE: "AI-ModelScope/zephyr-7b-alpha" + }, + "Zephyr-7B-Beta-Chat": { + DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-beta", + DownloadSource.MODELSCOPE: "modelscope/zephyr-7b-beta" + } + }, + 
template="zephyr" +) diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/logging.py b/LLM-Detector-V4-11w/src/llmtuner/extras/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..d01c14a48b9cf223864067edee587aa5d0f854f9 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/extras/logging.py @@ -0,0 +1,49 @@ +import sys +import logging + + +class LoggerHandler(logging.Handler): + r""" + Logger handler used in Web UI. + """ + + def __init__(self): + super().__init__() + self.log = "" + + def reset(self): + self.log = "" + + def emit(self, record): + if record.name == "httpx": + return + log_entry = self.format(record) + self.log += log_entry + self.log += "\n\n" + + +def get_logger(name: str) -> logging.Logger: + r""" + Gets a standard logger with a stream hander to stdout. + """ + formatter = logging.Formatter( + fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S" + ) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(formatter) + + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + logger.addHandler(handler) + + return logger + + +def reset_logging() -> None: + r""" + Removes basic config of root logger. (unused in script) + """ + root = logging.getLogger() + list(map(root.removeHandler, root.handlers)) + list(map(root.removeFilter, root.filters)) diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/misc.py b/LLM-Detector-V4-11w/src/llmtuner/extras/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..e1ae7d9f90eaff0b57e3180fac84a74a0ce2d355 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/extras/misc.py @@ -0,0 +1,140 @@ +import gc +import os +import sys +import torch +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList + +try: + from transformers.utils import ( + is_torch_bf16_cpu_available, + is_torch_bf16_gpu_available, + is_torch_cuda_available, + is_torch_npu_available + ) + _is_fp16_available = is_torch_npu_available() or is_torch_cuda_available() + _is_bf16_available = is_torch_bf16_gpu_available() or is_torch_bf16_cpu_available() +except ImportError: + _is_fp16_available = torch.cuda.is_available() + try: + _is_bf16_available = torch.cuda.is_bf16_supported() + except: + _is_bf16_available = False + +if TYPE_CHECKING: + from transformers import HfArgumentParser + from llmtuner.hparams import ModelArguments + + +class AverageMeter: + r""" + Computes and stores the average and current value. + """ + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: + r""" + Returns the number of trainable parameters and number of all parameters in the model. 
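+ Parameters of bitsandbytes 4-bit layers (Params4bit) are counted twice, since each stored element packs two 4-bit weights.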
+ """ + trainable_params, all_param = 0, 0 + for param in model.parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by 2 + if param.__class__.__name__ == "Params4bit": + num_params = num_params * 2 + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + +def get_current_device() -> str: + import accelerate + if accelerate.utils.is_xpu_available(): + return "xpu:{}".format(os.environ.get("LOCAL_RANK", "0")) + elif accelerate.utils.is_npu_available() or torch.cuda.is_available(): + return "cuda:{}".format(os.environ.get("LOCAL_RANK", "0")) + else: + return "cpu" + + +def get_logits_processor() -> "LogitsProcessorList": + r""" + Gets logits processor that removes NaN and Inf logits. + """ + logits_processor = LogitsProcessorList() + logits_processor.append(InfNanRemoveLogitsProcessor()) + return logits_processor + + +def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype: + r""" + Infers the optimal dtype according to the model_dtype and device compatibility. + """ + if _is_bf16_available and model_dtype == torch.bfloat16: + return torch.bfloat16 + elif _is_fp16_available: + return torch.float16 + else: + return torch.float32 + + +def parse_args(parser: "HfArgumentParser", args: Optional[Dict[str, Any]] = None) -> Tuple[Any]: + if args is not None: + return parser.parse_dict(args) + elif len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"): + return parser.parse_yaml_file(os.path.abspath(sys.argv[1])) + elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + return parser.parse_json_file(os.path.abspath(sys.argv[1])) + else: + return parser.parse_args_into_dataclasses() + + +def torch_gc() -> None: + r""" + Collects GPU memory. 
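+ Runs the Python garbage collector and, when CUDA is available, empties the cache and collects IPC handles.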
+ """ + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + + +def try_download_model_from_ms(model_args: "ModelArguments") -> None: + if not use_modelscope() or os.path.exists(model_args.model_name_or_path): + return + + try: + from modelscope import snapshot_download # type: ignore + revision = "master" if model_args.model_revision == "main" else model_args.model_revision + model_args.model_name_or_path = snapshot_download( + model_args.model_name_or_path, + revision=revision, + cache_dir=model_args.cache_dir + ) + except ImportError: + raise ImportError("Please install modelscope via `pip install modelscope -U`") + + +def use_modelscope() -> bool: + return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0"))) diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/packages.py b/LLM-Detector-V4-11w/src/llmtuner/extras/packages.py new file mode 100644 index 0000000000000000000000000000000000000000..22cab7324a6a000a8835c381df5738e938877b5e --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/extras/packages.py @@ -0,0 +1,55 @@ +import importlib.metadata +import importlib.util + + +def is_package_available(name: str) -> bool: + return importlib.util.find_spec(name) is not None + + +def get_package_version(name: str) -> str: + try: + return importlib.metadata.version(name) + except: + return "0.0.0" + + +_fastapi_available = is_package_available("fastapi") +_flash_attn2_available = is_package_available("flash_attn") and get_package_version("flash_attn").startswith("2") +_jieba_available = is_package_available("jieba") +_matplotlib_available = is_package_available("matplotlib") +_nltk_available = is_package_available("nltk") +_rouge_available = is_package_available("rouge_chinese") +_starlette_available = is_package_available("sse_starlette") +_uvicorn_available = is_package_available("uvicorn") + + +def is_fastapi_availble(): + return _fastapi_available + + +def is_flash_attn2_available(): + return _flash_attn2_available + + +def is_jieba_available(): + return _jieba_available + + +def is_matplotlib_available(): + return _matplotlib_available + + +def is_nltk_available(): + return _nltk_available + + +def is_rouge_available(): + return _rouge_available + + +def is_starlette_available(): + return _starlette_available + + +def is_uvicorn_available(): + return _uvicorn_available diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/patches/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/extras/patches/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/patches/llama_patch.py b/LLM-Detector-V4-11w/src/llmtuner/extras/patches/llama_patch.py new file mode 100644 index 0000000000000000000000000000000000000000..1fb7ed3b320145031f649c3b8136cb5e64b0a778 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/extras/patches/llama_patch.py @@ -0,0 +1,224 @@ +import math +import torch +import torch.nn as nn +from typing import Optional, Tuple +from transformers.utils import logging +from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb + +try: + from transformers.models.llama.modeling_llama import repeat_kv +except ImportError: + print("Please upgrade `transformers`.") + +from llmtuner.extras.packages import is_flash_attn2_available + + +if is_flash_attn2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func # type: ignore + from flash_attn.bert_padding import pad_input, unpad_input # 
type: ignore + + +logger = logging.get_logger(__name__) + + +# Modified from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +class LlamaShiftShortAttention(LlamaAttention): + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + if getattr(self, "num_key_value_groups", None): + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift + groupsz = int(q_len * getattr(self.config, "group_size_ratio")) + assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz) + num_groups = q_len // groupsz + def shift(state: torch.Tensor) -> torch.Tensor: + state = state.transpose(1, 2) # output: (bsz, seq_len, n_heads, head_dim) + state = torch.cat(( + state[:, :, :self.num_heads//2], state[:, :, self.num_heads//2:].roll(-groupsz//2, dims=1) + ), dim=2) + return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) + if attention_mask is not None: + attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) # (bsz, :, seq_len, :) or (bsz*n_group, :, groupsz, :) + attn_output = attn_output.transpose(1, 2).contiguous() + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift back + attn_output = attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim) + attn_output = torch.cat(( + attn_output[:, :, :self.num_heads//2], attn_output[:, :, self.num_heads//2:].roll(groupsz//2, dims=1) + ), dim=2) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + if not
output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class LlamaFlashAttention2(LlamaAttention): + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # LlamaFlashAttention2 attention does not support output_attentions + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # cast to half precision + input_dtype = query_states.dtype + if input_dtype == torch.float32: + logger.warning_once("The input hidden states seems to be silently casted in float32.") + query_states = query_states.to(self.config.torch_dtype) + key_states = key_states.to(self.config.torch_dtype) + value_states = value_states.to(self.config.torch_dtype) + + if getattr(self, "num_key_value_groups", None): + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + query_states = query_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim) + key_states = key_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim) + value_states = value_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim) + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift + groupsz = int(q_len * getattr(self.config, "group_size_ratio")) + assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz) + num_groups = q_len // groupsz + def shift(state: torch.Tensor) -> torch.Tensor: + state = torch.cat(( + state[:, :, :self.num_heads//2], state[:, :, self.num_heads//2:].roll(-groupsz//2, dims=1) + ), dim=2) + return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim) + + query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) + if attention_mask is not None: + attention_mask = attention_mask.reshape(bsz * num_groups, groupsz) + + if attention_mask is not None: + logger.warning_once("Padded sequences are less efficient in FlashAttention.") + # -q_len: assumes left padding when q_len != kv_len + unpadded_q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(query_states, attention_mask[:, -q_len:]) + unpadded_k, _, 
cu_seqlens_k, max_seqlen_k = unpad_input(key_states, attention_mask) + unpadded_v, _, _, _ = unpad_input(value_states, attention_mask) + attn_output_unpad = flash_attn_varlen_func( + unpadded_q, + unpadded_k, + unpadded_v, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + dropout_p=0.0, + softmax_scale=None, + causal=True, + ) + attn_output = pad_input(attn_output_unpad, indices_q, bsz, q_len) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, 0.0, softmax_scale=None, causal=True + ) + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift back + attn_output = attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim) + attn_output = torch.cat(( + attn_output[:, :, :self.num_heads//2], attn_output[:, :, self.num_heads//2:].roll(groupsz//2, dims=1) + ), dim=2) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Disable the transformation of the attention mask in LlamaModel as flash attention +# takes a boolean padding_mask. Fills in the past kv length for use in forward. +def _prepare_decoder_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: torch.Tensor, + inputs_embeds: torch.Tensor, + past_key_values_length: int +) -> torch.Tensor: + if attention_mask is not None and torch.all(attention_mask): + return None # This uses the faster call when training with full samples + + return attention_mask diff --git a/LLM-Detector-V4-11w/src/llmtuner/extras/ploting.py b/LLM-Detector-V4-11w/src/llmtuner/extras/ploting.py new file mode 100644 index 0000000000000000000000000000000000000000..cf2c72acd4315ccbac6ef31527dce71d791ae319 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/extras/ploting.py @@ -0,0 +1,55 @@ +import os +import math +import json +from typing import List, Optional +from transformers.trainer import TRAINER_STATE_NAME + +from llmtuner.extras.logging import get_logger +from llmtuner.extras.packages import is_matplotlib_available + +if is_matplotlib_available(): + import matplotlib.pyplot as plt + + +logger = get_logger(__name__) + + +def smooth(scalars: List[float]) -> List[float]: + r""" + EMA implementation according to TensorBoard.
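+ The smoothing weight is a sigmoid of the number of points (approaching 0.9 for long runs), so short runs are smoothed less aggressively.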
+ """ + last = scalars[0] + smoothed = list() + weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5) # a sigmoid function + for next_val in scalars: + smoothed_val = last * weight + (1 - weight) * next_val + smoothed.append(smoothed_val) + last = smoothed_val + return smoothed + + +def plot_loss(save_dictionary: os.PathLike, keys: Optional[List[str]] = ["loss"]) -> None: + + with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), "r", encoding="utf-8") as f: + data = json.load(f) + + for key in keys: + steps, metrics = [], [] + for i in range(len(data["log_history"])): + if key in data["log_history"][i]: + steps.append(data["log_history"][i]["step"]) + metrics.append(data["log_history"][i][key]) + + if len(metrics) == 0: + logger.warning(f"No metric {key} to plot.") + continue + + plt.figure() + plt.plot(steps, metrics, alpha=0.4, label="original") + plt.plot(steps, smooth(metrics), label="smoothed") + plt.title("training {} of {}".format(key, save_dictionary)) + plt.xlabel("step") + plt.ylabel(key) + plt.legend() + plt.savefig(os.path.join(save_dictionary, "training_{}.png".format(key)), format="png", dpi=100) + print("Figure saved:", os.path.join(save_dictionary, "training_{}.png".format(key))) diff --git a/LLM-Detector-V4-11w/src/llmtuner/hparams/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/hparams/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..623d65176d44d55cebff76c15e8ebff20d6758e8 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/hparams/__init__.py @@ -0,0 +1,5 @@ +from .data_args import DataArguments +from .evaluation_args import EvaluationArguments +from .finetuning_args import FinetuningArguments +from .generating_args import GeneratingArguments +from .model_args import ModelArguments diff --git a/LLM-Detector-V4-11w/src/llmtuner/hparams/data_args.py b/LLM-Detector-V4-11w/src/llmtuner/hparams/data_args.py new file mode 100644 index 0000000000000000000000000000000000000000..cea89198ae02fa1d33a0a90004a14a630501d692 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/hparams/data_args.py @@ -0,0 +1,179 @@ +import os +import json +from typing import List, Literal, Optional +from dataclasses import dataclass, field + + +DATA_CONFIG = "dataset_info.json" + + +@dataclass +class DatasetAttr: + + load_from: str + dataset_name: Optional[str] = None + dataset_sha1: Optional[str] = None + system_prompt: Optional[str] = None + subset: Optional[str] = None + ranking: Optional[bool] = False + formatting: Optional[Literal["alpaca", "sharegpt"]] = "alpaca" + + prompt: Optional[str] = "instruction" + query: Optional[str] = "input" + response: Optional[str] = "output" + history: Optional[str] = None + messages: Optional[str] = "conversations" + role: Optional[str] = "from" + content: Optional[str] = "value" + + def __repr__(self) -> str: + return self.dataset_name + + +@dataclass +class DataArguments: + r""" + Arguments pertaining to what data we are going to input our model for training and evaluation. + """ + template: Optional[str] = field( + default=None, + metadata={"help": "Which template to use for constructing prompts in training and inference."} + ) + dataset: Optional[str] = field( + default=None, + metadata={"help": "The name of provided dataset(s) to use. 
Use commas to separate multiple datasets."} + ) + dataset_dir: Optional[str] = field( + default="data", + metadata={"help": "Path to the folder containing the datasets."} + ) + split: Optional[str] = field( + default="train", + metadata={"help": "Which dataset split to use for training and evaluation."} + ) + cutoff_len: Optional[int] = field( + default=1024, + metadata={"help": "The maximum length of the model inputs after tokenization."} + ) + reserved_label_len: Optional[int] = field( + default=1, + metadata={"help": "The maximum length reserved for label after tokenization."} + ) + train_on_prompt: Optional[bool] = field( + default=False, + metadata={"help": "Whether to disable the mask on the prompt or not."} + ) + streaming: Optional[bool] = field( + default=False, + metadata={"help": "Enable dataset streaming."} + ) + buffer_size: Optional[int] = field( + default=16384, + metadata={"help": "Size of the buffer to randomly sample examples from in dataset streaming."} + ) + mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field( + default="concat", + metadata={"help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling)."} + ) + interleave_probs: Optional[str] = field( + default=None, + metadata={"help": "Probabilities to sample data from datasets. Use commas to separate multiple datasets."} + ) + overwrite_cache: Optional[bool] = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets."} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."} + ) + max_samples: Optional[int] = field( + default=None, + metadata={"help": "For debugging purposes, truncate the number of examples for each dataset."} + ) + eval_num_beams: Optional[int] = field( + default=None, + metadata={"help": "Number of beams to use for evaluation. This argument will be passed to `model.generate`"} + ) + ignore_pad_token_for_loss: Optional[bool] = field( + default=True, + metadata={"help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."} + ) + system_prompt: Optional[str] = field( + default=None, + metadata={"help": "System prompt to add before the user query. 
Use `|` to separate multiple prompts in training."} + ) + val_size: Optional[float] = field( + default=0, + metadata={"help": "Size of the development set, should be an integer or a float in range `[0,1)`."} + ) + sft_packing: Optional[bool] = field( + default=False, + metadata={"help": "Packing the questions and answers in the supervised fine-tuning stage."} + ) + cache_path: Optional[str] = field( + default=None, + metadata={"help": "Path to save or load the preprocessed datasets."} + ) + + def __post_init__(self): + if self.reserved_label_len >= self.cutoff_len: + raise ValueError("`reserved_label_len` must be smaller than `cutoff_len`.") + + if self.streaming and self.val_size > 1e-6 and self.val_size < 1: + raise ValueError("Streaming mode should have an integer val size.") + + if self.streaming and self.max_samples is not None: + raise ValueError("`max_samples` is incompatible with `streaming`.") + + if self.streaming and self.cache_path: + raise ValueError("`cache_path` is incompatible with `streaming`.") + + def init_for_training(self, seed: int): # support mixing multiple datasets + self.seed = seed + dataset_names = [ds.strip() for ds in self.dataset.split(",")] if self.dataset is not None else [] + try: + with open(os.path.join(self.dataset_dir, DATA_CONFIG), "r") as f: + dataset_info = json.load(f) + except Exception as err: + if self.dataset is not None: + raise ValueError("Cannot open {} due to {}.".format(os.path.join(self.dataset_dir, DATA_CONFIG), str(err))) + dataset_info = None + + prompt_list = self.system_prompt.split("|") if self.system_prompt else [None] + prompt_list = prompt_list * (len(dataset_names) // len(prompt_list)) + assert len(prompt_list) == len(dataset_names), "Number of system prompts should be equal to datasets or 1." 
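+ # prompt_list now holds one entry per dataset: a single system prompt is broadcast to all datasets, otherwise prompts must pair one-to-one with the datasets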
+ + if self.interleave_probs is not None: + self.interleave_probs = [float(prob.strip()) for prob in self.interleave_probs.split(",")] + + self.dataset_list: List[DatasetAttr] = [] + for i, name in enumerate(dataset_names): + if name not in dataset_info: + raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG)) + + if "hf_hub_url" in dataset_info[name]: + dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]) + elif "script_url" in dataset_info[name]: + dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"]) + else: + dataset_attr = DatasetAttr( + "file", + dataset_name=dataset_info[name]["file_name"], + dataset_sha1=dataset_info[name].get("file_sha1", None) + ) + + if "columns" in dataset_info[name]: + dataset_attr.prompt = dataset_info[name]["columns"].get("prompt", None) + dataset_attr.query = dataset_info[name]["columns"].get("query", None) + dataset_attr.response = dataset_info[name]["columns"].get("response", None) + dataset_attr.history = dataset_info[name]["columns"].get("history", None) + dataset_attr.messages = dataset_info[name]["columns"].get("messages", None) + dataset_attr.role = dataset_info[name]["columns"].get("role", None) + dataset_attr.content = dataset_info[name]["columns"].get("content", None) + + dataset_attr.subset = dataset_info[name].get("subset", None) + dataset_attr.ranking = dataset_info[name].get("ranking", False) + dataset_attr.formatting = dataset_info[name].get("formatting", "alpaca") + dataset_attr.system_prompt = prompt_list[i] + self.dataset_list.append(dataset_attr) diff --git a/LLM-Detector-V4-11w/src/llmtuner/hparams/evaluation_args.py b/LLM-Detector-V4-11w/src/llmtuner/hparams/evaluation_args.py new file mode 100644 index 0000000000000000000000000000000000000000..5f507698bd6bccbde81f60801ed36657c01c9e40 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/hparams/evaluation_args.py @@ -0,0 +1,55 @@ +import os +from typing import Literal, Optional +from dataclasses import dataclass, field + +from datasets import DownloadMode + + +@dataclass +class EvaluationArguments: + r""" + Arguments pertaining to specify the evaluation parameters. 
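+ The task must be a subdirectory of task_dir providing a mapping.json and one dataset per subject.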
+ """ + task: str = field( + metadata={"help": "Name of the evaluation task."} + ) + task_dir: Optional[str] = field( + default="evaluation", + metadata={"help": "Path to the folder containing the evaluation datasets."} + ) + batch_size: Optional[int] = field( + default=4, + metadata={"help": "The batch size per GPU for evaluation."} + ) + seed: Optional[int] = field( + default=42, + metadata={"help": "Random seed to be used with data loaders."} + ) + lang: Optional[Literal["en", "zh"]] = field( + default="en", + metadata={"help": "Language used at evaluation."} + ) + n_shot: Optional[int] = field( + default=5, + metadata={"help": "Number of examplars for few-shot learning."} + ) + save_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to save the evaluation results."} + ) + download_mode: Optional[DownloadMode] = field( + default=DownloadMode.REUSE_DATASET_IF_EXISTS, + metadata={"help": "Download mode used for the evaluation datasets."} + ) + + def __post_init__(self): + task_available = [] + for folder in os.listdir(self.task_dir): + if os.path.isdir(os.path.join(self.task_dir, folder)): + task_available.append(folder) + + if self.task not in task_available: + raise ValueError("Task {} not found in {}.".format(self.task, self.task_dir)) + + if self.save_dir is not None and os.path.exists(self.save_dir): + raise ValueError("`save_dir` already exists, use another one.") diff --git a/LLM-Detector-V4-11w/src/llmtuner/hparams/finetuning_args.py b/LLM-Detector-V4-11w/src/llmtuner/hparams/finetuning_args.py new file mode 100644 index 0000000000000000000000000000000000000000..cf60676a0b982939a709f311bb3733e00457264e --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/hparams/finetuning_args.py @@ -0,0 +1,196 @@ +import json +from typing import Literal, Optional +from dataclasses import asdict, dataclass, field + + +@dataclass +class FreezeArguments: + r""" + Arguments pertaining to the freeze (partial-parameter) training. + """ + name_module_trainable: Optional[str] = field( + default="mlp", + metadata={"help": "Name of trainable modules for partial-parameter (freeze) fine-tuning. \ + Use commas to separate multiple modules. \ + LLaMA choices: [\"mlp\", \"self_attn\"], \ + BLOOM & Falcon & ChatGLM choices: [\"mlp\", \"self_attention\"], \ + Qwen choices: [\"mlp\", \"attn\"], \ + Phi-1.5 choices: [\"mlp\", \"mixer\"], \ + Others choices: the same as LLaMA."} + ) + num_layer_trainable: Optional[int] = field( + default=3, + metadata={"help": "The number of trainable layers for partial-parameter (freeze) fine-tuning."} + ) + + +@dataclass +class LoraArguments: + r""" + Arguments pertaining to the LoRA training. + """ + additional_target: Optional[str] = field( + default=None, + metadata={"help": "Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint."} + ) + lora_alpha: Optional[float] = field( + default=None, + metadata={"help": "The scale factor for LoRA fine-tuning (default: lora_rank * 2.0)."} + ) + lora_dropout: Optional[float] = field( + default=0.1, + metadata={"help": "Dropout rate for the LoRA fine-tuning."} + ) + lora_rank: Optional[int] = field( + default=8, + metadata={"help": "The intrinsic dimension for LoRA fine-tuning."} + ) + lora_target: Optional[str] = field( + default=None, + metadata={"help": "Name(s) of target modules to apply LoRA. Use commas to separate multiple modules. 
\ + LLaMA choices: [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"], \ + BLOOM & Falcon & ChatGLM choices: [\"query_key_value\", \"dense\", \"dense_h_to_4h\", \"dense_4h_to_h\"], \ + Baichuan choices: [\"W_pack\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"], \ + Qwen choices: [\"c_attn\", \"attn.c_proj\", \"w1\", \"w2\", \"mlp.c_proj\"], \ + Phi-1.5 choices: [\"Wqkv\", \"out_proj\", \"fc1\", \"fc2\"], \ + Others choices: the same as LLaMA."} + ) + resume_lora_training: Optional[bool] = field( + default=True, + metadata={"help": "Whether to resume training from the last LoRA weights or create new weights after merging them."} + ) + + +@dataclass +class RLHFArguments: + r""" + Arguments pertaining to the PPO and DPO training. + """ + dpo_beta: Optional[float] = field( + default=0.1, + metadata={"help": "The beta parameter for the DPO loss."} + ) + ppo_buffer_size: Optional[int] = field( + default=1, + metadata={"help": "The number of mini-batches to make experience buffer in a PPO optimization step."} + ) + ppo_epochs: Optional[int] = field( + default=4, + metadata={"help": "The number of epochs to perform in a PPO optimization step."} + ) + ppo_logger: Optional[str] = field( + default=None, + metadata={"help": "Log with either \"wandb\" or \"tensorboard\" in PPO training."} + ) + ppo_score_norm: Optional[bool] = field( + default=False, + metadata={"help": "Use score normalization in PPO training."} + ) + ppo_target: Optional[float] = field( + default=6.0, + metadata={"help": "Target KL value for adaptive KL control in PPO training."} + ) + ppo_whiten_rewards: Optional[bool] = field( + default=False, + metadata={"help": "Whiten the rewards before compute advantages in PPO training."} + ) + ref_model: Optional[str] = field( + default=None, + metadata={"help": "Path to the reference model used for the PPO or DPO training."} + ) + ref_model_checkpoint: Optional[str] = field( + default=None, + metadata={"help": "Path to the directory(s) containing the model checkpoints of the reference model."} + ) + ref_model_quantization_bit: Optional[int] = field( + default=None, + metadata={"help": "The number of bits to quantize the reference model."} + ) + reward_model: Optional[str] = field( + default=None, + metadata={"help": "Path to the directory containing the checkpoints of the reward model."} + ) + reward_model_checkpoint: Optional[str] = field( + default=None, + metadata={"help": "Path to the directory(s) containing the model checkpoints of the reward model."} + ) + reward_model_quantization_bit: Optional[int] = field( + default=None, + metadata={"help": "The number of bits to quantize the reward model."} + ) + reward_model_type: Optional[Literal["lora", "full"]] = field( + default="lora", + metadata={"help": "The checkpoint type of the reward model. The lora type only supports lora training."} + ) + + +@dataclass +class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments): + r""" + Arguments pertaining to which techniques we are going to fine-tuning with. 
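+ Combines the freeze, LoRA and RLHF argument groups; for example, stage="sft" with finetuning_type="lora" selects LoRA-based supervised fine-tuning.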
+ """ + stage: Optional[Literal["pt", "sft", "rm", "ppo", "dpo"]] = field( + default="sft", + metadata={"help": "Which stage will be performed in training."} + ) + finetuning_type: Optional[Literal["lora", "freeze", "full"]] = field( + default="lora", + metadata={"help": "Which fine-tuning method to use."} + ) + upcast_layernorm: Optional[bool] = field( + default=False, + metadata={"help": "Whether to upcast the layernorm weights in fp32."} + ) + neft_alpha: Optional[float] = field( + default=0, + metadata={"help": "The alpha parameter to control the noise magnitude in NEFTune."} + ) + export_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to the directory to save the exported model."} + ) + export_size: Optional[int] = field( + default=1, + metadata={"help": "The file shard size (in GB) of the exported model."} + ) + plot_loss: Optional[bool] = field( + default=False, + metadata={"help": "Whether to plot the training loss after fine-tuning or not."} + ) + + def __post_init__(self): + def split_arg(arg): + if isinstance(arg, str): + return [item.strip() for item in arg.split(",")] + return arg + + self.name_module_trainable = split_arg(self.name_module_trainable) + self.lora_alpha = self.lora_alpha or float(self.lora_rank * 2.0) + self.lora_target = split_arg(self.lora_target) + self.additional_target = split_arg(self.additional_target) + self.ref_model_checkpoint = split_arg(self.ref_model_checkpoint) + self.reward_model_checkpoint = split_arg(self.reward_model_checkpoint) + + assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." + assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." + assert self.reward_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." + + if self.stage == "ppo" and self.reward_model is None: + raise ValueError("Reward model is necessary for PPO training.") + + if self.stage == "ppo" and self.reward_model_type == "lora" and self.finetuning_type != "lora": + raise ValueError("Freeze/Full PPO training needs `reward_model_type=full`.") + + def save_to_json(self, json_path: str): + r"""Saves the content of this instance in JSON format inside `json_path`.""" + json_string = json.dumps(asdict(self), indent=2, sort_keys=True) + "\n" + with open(json_path, "w", encoding="utf-8") as f: + f.write(json_string) + + @classmethod + def load_from_json(cls, json_path: str): + r"""Creates an instance from the content of `json_path`.""" + with open(json_path, "r", encoding="utf-8") as f: + text = f.read() + + return cls(**json.loads(text)) diff --git a/LLM-Detector-V4-11w/src/llmtuner/hparams/generating_args.py b/LLM-Detector-V4-11w/src/llmtuner/hparams/generating_args.py new file mode 100644 index 0000000000000000000000000000000000000000..c04a5c36964498dcbd5a43931e3fe858ed95b53f --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/hparams/generating_args.py @@ -0,0 +1,53 @@ +from typing import Any, Dict, Optional +from dataclasses import asdict, dataclass, field + + +@dataclass +class GeneratingArguments: + r""" + Arguments pertaining to specify the decoding parameters. 
+ """ + do_sample: Optional[bool] = field( + default=True, + metadata={"help": "Whether or not to use sampling, use greedy decoding otherwise."} + ) + temperature: Optional[float] = field( + default=0.95, + metadata={"help": "The value used to modulate the next token probabilities."} + ) + top_p: Optional[float] = field( + default=0.7, + metadata={"help": "The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept."} + ) + top_k: Optional[int] = field( + default=50, + metadata={"help": "The number of highest probability vocabulary tokens to keep for top-k filtering."} + ) + num_beams: Optional[int] = field( + default=1, + metadata={"help": "Number of beams for beam search. 1 means no beam search."} + ) + max_length: Optional[int] = field( + default=512, + metadata={"help": "The maximum length the generated tokens can have. It can be overridden by max_new_tokens."} + ) + max_new_tokens: Optional[int] = field( + default=512, + metadata={"help": "The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt."} + ) + repetition_penalty: Optional[float] = field( + default=1.0, + metadata={"help": "The parameter for repetition penalty. 1.0 means no penalty."} + ) + length_penalty: Optional[float] = field( + default=1.0, + metadata={"help": "Exponential penalty to the length that is used with beam-based generation."} + ) + + def to_dict(self) -> Dict[str, Any]: + args = asdict(self) + if args.get("max_new_tokens", -1) > 0: + args.pop("max_length", None) + else: + args.pop("max_new_tokens", None) + return args diff --git a/LLM-Detector-V4-11w/src/llmtuner/hparams/model_args.py b/LLM-Detector-V4-11w/src/llmtuner/hparams/model_args.py new file mode 100644 index 0000000000000000000000000000000000000000..07903b37b2406fc7dccbcef745c6254e850b721d --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/hparams/model_args.py @@ -0,0 +1,76 @@ +from typing import Any, Dict, Literal, Optional +from dataclasses import asdict, dataclass, field + + +@dataclass +class ModelArguments: + r""" + Arguments pertaining to which model/config/tokenizer we are going to fine-tune. 
+ """ + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from \ + huggingface.co/models or modelscope.cn/models."} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pretrained models downloaded from huggingface.co."} + ) + use_fast_tokenizer: Optional[bool] = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."} + ) + split_special_tokens: Optional[bool] = field( + default=False, + metadata={"help": "Whether or not the special tokens should be split during the tokenization process."} + ) + model_revision: Optional[str] = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."} + ) + quantization_bit: Optional[int] = field( + default=None, + metadata={"help": "The number of bits to quantize the model."} + ) + quantization_type: Optional[Literal["fp4", "nf4"]] = field( + default="nf4", + metadata={"help": "Quantization data type to use in int4 training."} + ) + double_quantization: Optional[bool] = field( + default=True, + metadata={"help": "Whether to use double quantization in int4 training or not."} + ) + rope_scaling: Optional[Literal["linear", "dynamic"]] = field( + default=None, + metadata={"help": "Adopt scaled rotary positional embeddings."} + ) + checkpoint_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to the directory(s) containing the model checkpoints as well as the configurations."} + ) + flash_attn: Optional[bool] = field( + default=False, + metadata={"help": "Enable FlashAttention-2 for faster training."} + ) + shift_attn: Optional[bool] = field( + default=False, + metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."} + ) + hf_hub_token: Optional[str] = field( + default=None, + metadata={"help": "Auth token to log in with Hugging Face Hub."} + ) + + def __post_init__(self): + self.compute_dtype = None + self.model_max_length = None + + if self.split_special_tokens and self.use_fast_tokenizer: + raise ValueError("`split_special_tokens` is only supported for slow tokenizers.") + + if self.checkpoint_dir is not None: # support merging multiple lora weights + self.checkpoint_dir = [cd.strip() for cd in self.checkpoint_dir.split(",")] + + assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." 
+ + def to_dict(self) -> Dict[str, Any]: + return asdict(self) diff --git a/LLM-Detector-V4-11w/src/llmtuner/model/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f12acb5818c591a15be15d6174002c993d86e067 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/model/__init__.py @@ -0,0 +1,5 @@ +# Level: loader > adapter > parser, utils + +from llmtuner.model.loader import load_model_and_tokenizer +from llmtuner.model.parser import get_train_args, get_infer_args, get_eval_args +from llmtuner.model.utils import dispatch_model, get_modelcard_args, load_valuehead_params diff --git a/LLM-Detector-V4-11w/src/llmtuner/model/adapter.py b/LLM-Detector-V4-11w/src/llmtuner/model/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..9a3fb3f6189f7e8e3f53ccd17a1227fe4d3b6f07 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/model/adapter.py @@ -0,0 +1,108 @@ +import torch +from typing import TYPE_CHECKING +from peft import PeftModel, TaskType, LoraConfig, get_peft_model + +from llmtuner.extras.logging import get_logger +from llmtuner.model.utils import find_all_linear_modules + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from llmtuner.hparams import ModelArguments, FinetuningArguments + + +logger = get_logger(__name__) + + +def init_adapter( + model: "PreTrainedModel", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool +) -> "PreTrainedModel": + r""" + Initializes the adapters. + + Support full-parameter, freeze and LoRA training. + + Note that the trainable parameters must be cast to float32. + """ + + if (not is_trainable) and model_args.checkpoint_dir is None: + logger.info("Checkpoint is not found at evaluation, load the original model.") + return model + + if finetuning_args.finetuning_type == "full" and is_trainable: + logger.info("Fine-tuning method: Full") + model = model.float() + + if finetuning_args.finetuning_type == "freeze" and is_trainable: + logger.info("Fine-tuning method: Freeze") + num_layers = ( + getattr(model.config, "num_hidden_layers", None) + or getattr(model.config, "num_layers", None) + or getattr(model.config, "n_layer", None) + ) + if not num_layers: + raise ValueError("Current model does not support freeze tuning.") + if finetuning_args.num_layer_trainable > 0: # fine-tuning the last n layers if num_layer_trainable > 0 + trainable_layer_ids = [num_layers - k - 1 for k in range(finetuning_args.num_layer_trainable)] + else: # fine-tuning the first n layers if num_layer_trainable < 0 + trainable_layer_ids = [k for k in range(-finetuning_args.num_layer_trainable)] + + trainable_layers = [] + for module_name in finetuning_args.name_module_trainable: + for idx in trainable_layer_ids: + trainable_layers.append("{:d}.{}".format(idx, module_name)) + + for name, param in model.named_parameters(): + if not any(trainable_layer in name for trainable_layer in trainable_layers): + param.requires_grad_(False) + else: + param.data = param.data.to(torch.float32) + + if finetuning_args.finetuning_type == "lora": + logger.info("Fine-tuning method: LoRA") + checkpoint_to_resume = None + + if model_args.checkpoint_dir is not None: + is_mergeable = True + if getattr(model, "quantization_method", None) == "gptq": + assert len(model_args.checkpoint_dir) == 1, "GPTQ quantized model only accepts a single checkpoint." 
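+ # GPTQ-quantized weights cannot absorb merged LoRA deltas, so the adapter is kept separate below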
+ is_mergeable = False + + if (is_trainable and finetuning_args.resume_lora_training) or (not is_mergeable): + checkpoints_to_merge, checkpoint_to_resume = model_args.checkpoint_dir[:-1], model_args.checkpoint_dir[-1] + else: + checkpoints_to_merge = model_args.checkpoint_dir + + for checkpoint in checkpoints_to_merge: + model = PeftModel.from_pretrained(model, checkpoint) + model = model.merge_and_unload() + + if len(checkpoints_to_merge) > 0: + logger.info("Merged {} model checkpoint(s).".format(len(checkpoints_to_merge))) + + if checkpoint_to_resume is not None: # resume lora training + model = PeftModel.from_pretrained(model, checkpoint_to_resume, is_trainable=is_trainable) + + if is_trainable and checkpoint_to_resume is None: # create new lora weights while training + if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": + target_modules = find_all_linear_modules(model, model_args.quantization_bit) + else: + target_modules = finetuning_args.lora_target + + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=False, + r=finetuning_args.lora_rank, + lora_alpha=finetuning_args.lora_alpha, + lora_dropout=finetuning_args.lora_dropout, + target_modules=target_modules, + modules_to_save=finetuning_args.additional_target + ) + model = get_peft_model(model, lora_config) + + if model_args.checkpoint_dir is not None: + logger.info("Loaded fine-tuned model from checkpoint(s): {}".format(",".join(model_args.checkpoint_dir))) + + return model diff --git a/LLM-Detector-V4-11w/src/llmtuner/model/loader.py b/LLM-Detector-V4-11w/src/llmtuner/model/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..530869d5b2b4bca8042102a7c9efb948fc94205c --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/model/loader.py @@ -0,0 +1,235 @@ +import math +import torch +from types import MethodType +from typing import TYPE_CHECKING, Literal, Optional, Tuple + +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + PretrainedConfig, + PreTrainedModel, + PreTrainedTokenizerBase +) +from transformers.models.llama import modeling_llama as LlamaModule +from transformers.utils.versions import require_version +from trl import AutoModelForCausalLMWithValueHead + +try: + from transformers.integrations import is_deepspeed_zero3_enabled +except ImportError: # https://github.com/huggingface/transformers/releases/tag/v4.33.1 + from transformers.deepspeed import is_deepspeed_zero3_enabled + +from llmtuner.extras.logging import get_logger +from llmtuner.extras.misc import count_parameters, get_current_device, infer_optim_dtype, try_download_model_from_ms +from llmtuner.extras.packages import is_flash_attn2_available +from llmtuner.extras.patches import llama_patch as LlamaPatches +from llmtuner.hparams import FinetuningArguments +from llmtuner.model.adapter import init_adapter +from llmtuner.model.utils import load_valuehead_params, prepare_model_for_training + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + from llmtuner.hparams import ModelArguments + + +logger = get_logger(__name__) + + +require_version("transformers>=4.31.0,<4.35.0", "To fix: pip install \"transformers>=4.31.0,<4.35.0\"") +require_version("datasets>=2.14.0", "To fix: pip install datasets>=2.14.0") +require_version("accelerate>=0.21.0", "To fix: pip install accelerate>=0.21.0") +require_version("peft>=0.6.0", "To fix: pip install peft>=0.6.0") +require_version("trl>=0.7.4", "To fix: pip install trl>=0.7.4") + + +def 
load_model_and_tokenizer( + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: Optional[bool] = False, + stage: Optional[Literal["pt", "sft", "rm", "ppo"]] = "sft" +) -> Tuple[PreTrainedModel, "PreTrainedTokenizer"]: + r""" + Loads pretrained model and tokenizer. + + Support both training and inference. + """ + + try_download_model_from_ms(model_args) + + config_kwargs = { + "trust_remote_code": True, + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "token": model_args.hf_hub_token + } + + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + use_fast=model_args.use_fast_tokenizer, + split_special_tokens=model_args.split_special_tokens, + padding_side="right", # training with left-padded tensors in fp16 precision may cause overflow + **config_kwargs + ) + + if finetuning_args.finetuning_type != "lora" and model_args.checkpoint_dir is not None: + logger.info("Use `model_name_or_path` to specify the model trained with full/freeze method.") + model_to_load = model_args.checkpoint_dir[0] + else: + model_to_load = model_args.model_name_or_path + + config = AutoConfig.from_pretrained(model_to_load, **config_kwargs) + + # Fix tokenizer (for ChatGLM2 and ChatGLM3) + if getattr(config, "model_type", None) == "chatglm": + tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer) + + # Set model dtype + if model_args.compute_dtype is None: # priority: bf16 > fp16 > fp32 + model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None)) + setattr(config, "torch_dtype", model_args.compute_dtype) + + # Fix config (for Qwen) + if getattr(config, "model_type", None) == "qwen": + for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]: + setattr(config, dtype_name, getattr(config, "torch_dtype", None) == dtype) + + # Set RoPE scaling + if model_args.rope_scaling is not None: + if not hasattr(config, "rope_scaling"): + logger.warning("Current model does not support RoPE scaling.") + else: + if is_trainable: + if model_args.rope_scaling == "dynamic": + logger.warning( + "Dynamic NTK may not work well with fine-tuning. " + "See: https://github.com/huggingface/transformers/pull/24653" + ) + + current_max_length = getattr(config, "max_position_embeddings", None) + if current_max_length and model_args.model_max_length > current_max_length: + scaling_factor = float(math.ceil(model_args.model_max_length / current_max_length)) + else: + logger.warning("Input length is smaller than max length. 
Consider increase input length.") + scaling_factor = 1.0 + else: + scaling_factor = 2.0 + + setattr(config, "rope_scaling", {"type": model_args.rope_scaling, "factor": scaling_factor}) + logger.info("Using {} scaling strategy and setting scaling factor to {}".format( + model_args.rope_scaling, scaling_factor + )) + + # Set FlashAttention-2 + if model_args.flash_attn: + if getattr(config, "model_type", None) == "llama": + if is_flash_attn2_available(): + LlamaModule.LlamaAttention = LlamaPatches.LlamaFlashAttention2 + LlamaModule.LlamaModel._prepare_decoder_attention_mask = LlamaPatches._prepare_decoder_attention_mask + logger.info("Using FlashAttention-2 for faster training and inference.") + else: + logger.warning("FlashAttention-2 is not installed.") + elif getattr(config, "model_type", None) in ["qwen", "Yi"]: + logger.info("Current model automatically enables FlashAttention if installed.") + else: + logger.warning("Current model does not support FlashAttention.") + elif is_trainable and model_args.shift_attn and getattr(config, "model_type", None) == "llama": + LlamaModule.LlamaAttention = LlamaPatches.LlamaShiftShortAttention + logger.warning("Using `--flash_attn` for faster training in large context length.") + + # Set shift short attention (S^2-Attn) + if is_trainable and model_args.shift_attn: + if getattr(config, "model_type", None) == "llama": + setattr(config, "group_size_ratio", 0.25) + logger.info("Using shift short attention with group_size_ratio=1/4.") + else: + logger.warning("Current model does not support shift short attention.") + + # Quantization configurations (using gptq or awq) + if getattr(config, "quantization_config", None): + if model_args.quantization_bit is not None: # remove bnb quantization + model_args.quantization_bit = None + config_kwargs["device_map"] = {"": get_current_device()} + quantization_config = getattr(config, "quantization_config", None) + logger.info("Loading {}-bit quantized model.".format(quantization_config.get("bits", -1))) + + # Quantization configurations (using bitsandbytes library) + if model_args.quantization_bit is not None: + if is_deepspeed_zero3_enabled(): + raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantization.") + + if model_args.quantization_bit == 8: + require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") + config_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + + if model_args.quantization_bit == 4: + require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") + config_kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=model_args.compute_dtype, + bnb_4bit_use_double_quant=model_args.double_quantization, + bnb_4bit_quant_type=model_args.quantization_type + ) + + config_kwargs["device_map"] = {"": get_current_device()} + logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) + + # Load pre-trained models (without valuehead) + model = AutoModelForCausalLM.from_pretrained( + model_to_load, + config=config, + torch_dtype=model_args.compute_dtype, + low_cpu_mem_usage=(not is_deepspeed_zero3_enabled()), + **config_kwargs + ) + + # Disable custom generate method (for Qwen and Baichuan2) + if isinstance(model, PreTrainedModel) and "GenerationMixin" not in str(model.generate.__func__): + model.generate = MethodType(PreTrainedModel.generate, model) + + # Fix LM head (for ChatGLM2 and ChatGLM3) + if getattr(config, "model_type", None) == "chatglm": + setattr(model, "lm_head", 
model.transformer.output_layer) + setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"]) + + # Register auto class to save the custom code files + if isinstance(config, PretrainedConfig) and "AutoConfig" in getattr(config, "auto_map", {}): + config.__class__.register_for_auto_class() + if isinstance(model, PreTrainedModel) and "AutoModelForCausalLM" in getattr(config, "auto_map", {}): + model.__class__.register_for_auto_class() + if isinstance(tokenizer, PreTrainedTokenizerBase) and "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}): + tokenizer.__class__.register_for_auto_class() + + # Initialize adapters + model = prepare_model_for_training(model=model, finetuning_args=finetuning_args) if is_trainable else model + model = init_adapter(model, model_args, finetuning_args, is_trainable) + model = model.train() if is_trainable else model.eval() + + # Prepare model with valuehead for RLHF + if stage in ["rm", "ppo"]: + model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained(model) + setattr(model, "_keys_to_ignore_on_save", [name for name, _ in model.named_parameters() if "pretrained_model" in name]) + setattr(model, "tie_weights", MethodType(lambda _: None, model)) # use empty method + vhead_path = ( + model_args.checkpoint_dir[-1] if model_args.checkpoint_dir is not None else model_args.model_name_or_path + ) + vhead_params = load_valuehead_params(vhead_path, model_args) + if vhead_params is not None: + model.load_state_dict(vhead_params, strict=False) + logger.info("Loaded valuehead from checkpoint: {}".format(vhead_path)) + + # Prepare model for inference + if not is_trainable: + model.requires_grad_(False) # fix all model params + model = model.to(model_args.compute_dtype) if model_args.quantization_bit is None else model + + trainable_params, all_param = count_parameters(model) + logger.info("trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( + trainable_params, all_param, 100 * trainable_params / all_param + )) + + if not is_trainable: + logger.info("This IS expected that the trainable params is 0 if you are using model for inference only.") + + return model, tokenizer diff --git a/LLM-Detector-V4-11w/src/llmtuner/model/parser.py b/LLM-Detector-V4-11w/src/llmtuner/model/parser.py new file mode 100644 index 0000000000000000000000000000000000000000..d298996ee27b5d8e3716ec84e3c6275b9ae9e960 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/model/parser.py @@ -0,0 +1,205 @@ +import os +import torch +import datasets +import transformers +from typing import Any, Dict, Optional, Tuple +from transformers import HfArgumentParser, Seq2SeqTrainingArguments +from transformers.trainer_utils import get_last_checkpoint + +from llmtuner.extras.logging import get_logger +from llmtuner.extras.misc import parse_args +from llmtuner.hparams import ( + ModelArguments, + DataArguments, + EvaluationArguments, + FinetuningArguments, + GeneratingArguments +) + + +logger = get_logger(__name__) + + +_TRAIN_ARGS = [ + ModelArguments, DataArguments, Seq2SeqTrainingArguments, FinetuningArguments, GeneratingArguments +] +_TRAIN_CLS = Tuple[ + ModelArguments, DataArguments, Seq2SeqTrainingArguments, FinetuningArguments, GeneratingArguments +] +_INFER_ARGS = [ + ModelArguments, DataArguments, FinetuningArguments, GeneratingArguments +] +_INFER_CLS = Tuple[ + ModelArguments, DataArguments, FinetuningArguments, GeneratingArguments +] +_EVAL_ARGS = [ + ModelArguments, DataArguments, EvaluationArguments, FinetuningArguments +] +_EVAL_CLS = 
Tuple[ + ModelArguments, DataArguments, EvaluationArguments, FinetuningArguments +] + + +def _verify_model_args(model_args: "ModelArguments", finetuning_args: "FinetuningArguments") -> None: + if model_args.quantization_bit is not None and finetuning_args.finetuning_type != "lora": + raise ValueError("Quantization is only compatible with the LoRA method.") + + if ( + model_args.checkpoint_dir is not None + and len(model_args.checkpoint_dir) != 1 + and finetuning_args.finetuning_type != "lora" + ): + raise ValueError("Multiple checkpoints are only available for LoRA tuning.") + + +def parse_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: + parser = HfArgumentParser(_TRAIN_ARGS) + return parse_args(parser, args) + + +def parse_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS: + parser = HfArgumentParser(_INFER_ARGS) + return parse_args(parser, args) + + +def parse_eval_args(args: Optional[Dict[str, Any]] = None) -> _EVAL_CLS: + parser = HfArgumentParser(_EVAL_ARGS) + return parse_args(parser, args) + + +def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: + model_args, data_args, training_args, finetuning_args, generating_args = parse_train_args(args) + + # Setup logging + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + + log_level = training_args.get_process_log_level() + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Check arguments + data_args.init_for_training(training_args.seed) + + if finetuning_args.stage != "pt" and data_args.template is None: + raise ValueError("Please specify which `template` to use.") + + if finetuning_args.stage != "sft" and training_args.predict_with_generate: + raise ValueError("`predict_with_generate` cannot be set as True except SFT.") + + if finetuning_args.stage == "sft" and training_args.do_predict and not training_args.predict_with_generate: + raise ValueError("Please enable `predict_with_generate` to save model predictions.") + + if finetuning_args.stage in ["rm", "ppo"] and training_args.load_best_model_at_end: + raise ValueError("RM and PPO stages do not support `load_best_model_at_end`.") + + if finetuning_args.stage == "ppo" and not training_args.do_train: + raise ValueError("PPO training does not support evaluation, use the SFT stage to evaluate models.") + + if finetuning_args.stage in ["rm", "dpo"] and (not all([data_attr.ranking for data_attr in data_args.dataset_list])): + raise ValueError("Please use ranked datasets for reward modeling or DPO training.") + + if finetuning_args.stage == "ppo" and model_args.shift_attn: + raise ValueError("PPO training is incompatible with S^2-Attn.") + + if training_args.max_steps == -1 and data_args.streaming: + raise ValueError("Please specify `max_steps` in streaming mode.") + + if training_args.do_train and training_args.predict_with_generate: + raise ValueError("`predict_with_generate` cannot be set as True while training.") + + if training_args.do_train and finetuning_args.finetuning_type == "lora" and finetuning_args.lora_target is None: + raise ValueError("Please specify `lora_target` in LoRA training.") + + _verify_model_args(model_args, finetuning_args) + + if training_args.do_train and model_args.quantization_bit is not None and (not 
finetuning_args.upcast_layernorm): + logger.warning("We recommend enabling `upcast_layernorm` in quantized training.") + + if training_args.do_train and (not training_args.fp16) and (not training_args.bf16): + logger.warning("We recommend enabling mixed precision training.") + + if (not training_args.do_train) and model_args.quantization_bit is not None: + logger.warning("Evaluating model in 4/8-bit mode may cause lower scores.") + + if (not training_args.do_train) and finetuning_args.stage == "dpo" and finetuning_args.ref_model is None: + logger.warning("Specify `ref_model` for computing rewards at evaluation.") + + # postprocess training_args + if ( + training_args.local_rank != -1 + and training_args.ddp_find_unused_parameters is None + and finetuning_args.finetuning_type == "lora" + ): + logger.warning("`ddp_find_unused_parameters` needs to be set to False for LoRA in DDP training.") + training_args_dict = training_args.to_dict() + training_args_dict.update(dict(ddp_find_unused_parameters=False)) + training_args = Seq2SeqTrainingArguments(**training_args_dict) + + if ( + training_args.resume_from_checkpoint is None + and training_args.do_train + and os.path.isdir(training_args.output_dir) + and not training_args.overwrite_output_dir + ): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError("Output directory already exists and is not empty. Please set `overwrite_output_dir`.") + + if last_checkpoint is not None: + training_args_dict = training_args.to_dict() + training_args_dict.update(dict(resume_from_checkpoint=last_checkpoint)) + training_args = Seq2SeqTrainingArguments(**training_args_dict) + logger.info("Resuming training from {}. Change `output_dir` or use `overwrite_output_dir` to avoid.".format( + training_args.resume_from_checkpoint + )) + + if finetuning_args.stage in ["rm", "ppo"] and training_args.resume_from_checkpoint is not None: + logger.warning("Add {} to `checkpoint_dir` to resume training from checkpoint.".format( + training_args.resume_from_checkpoint + )) + + # postprocess model_args + model_args.compute_dtype = ( + torch.bfloat16 if training_args.bf16 else (torch.float16 if training_args.fp16 else None) + ) + model_args.model_max_length = data_args.cutoff_len + + # Log on each process the small summary: + logger.info("Process rank: {}, device: {}, n_gpu: {}\n distributed training: {}, compute dtype: {}".format( + training_args.local_rank, training_args.device, training_args.n_gpu, + bool(training_args.local_rank != -1), str(model_args.compute_dtype) + )) + logger.info(f"Training/evaluation parameters {training_args}") + + # Set seed before initializing model.
+ transformers.set_seed(training_args.seed) + + return model_args, data_args, training_args, finetuning_args, generating_args + + +def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS: + model_args, data_args, finetuning_args, generating_args = parse_infer_args(args) + + if data_args.template is None: + raise ValueError("Please specify which `template` to use.") + + _verify_model_args(model_args, finetuning_args) + + return model_args, data_args, finetuning_args, generating_args + + +def get_eval_args(args: Optional[Dict[str, Any]] = None) -> _EVAL_CLS: + model_args, data_args, eval_args, finetuning_args = parse_eval_args(args) + + if data_args.template is None: + raise ValueError("Please specify which `template` to use.") + + _verify_model_args(model_args, finetuning_args) + + transformers.set_seed(eval_args.seed) + + return model_args, data_args, eval_args, finetuning_args diff --git a/LLM-Detector-V4-11w/src/llmtuner/model/utils.py b/LLM-Detector-V4-11w/src/llmtuner/model/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..12a4544566b13113ef3e963529baa0131aaf8fbd --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/model/utils.py @@ -0,0 +1,183 @@ +import torch +import inspect +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple + +from transformers.utils import cached_file +from transformers.trainer import WEIGHTS_NAME, SAFE_WEIGHTS_NAME + +from llmtuner.extras.constants import LAYERNORM_NAMES +from llmtuner.extras.logging import get_logger +from llmtuner.hparams import ModelArguments, FinetuningArguments + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from llmtuner.hparams import DataArguments + + +logger = get_logger(__name__) + + +def dispatch_model(model: "PreTrainedModel") -> "PreTrainedModel": + r""" + Dispatches a pre-trained model to GPUs with balanced memory. + Borrowed from: https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/modeling_utils.py#L2803 + """ + if getattr(model, "quantization_method", None): # already set on current device + return model + + if torch.cuda.device_count() > 1: + from accelerate import dispatch_model + from accelerate.utils import infer_auto_device_map, get_balanced_memory + + if model._no_split_modules is None: + raise ValueError("The model class needs to implement the `_no_split_modules` attribute.") + + kwargs = {"dtype": model.dtype, "no_split_module_classes": model._no_split_modules} + max_memory = get_balanced_memory(model, **kwargs) + # Make sure tied weights are tied before creating the device map. + model.tie_weights() + device_map = infer_auto_device_map(model, max_memory=max_memory, **kwargs) + return dispatch_model(model, device_map) + else: + return model.cuda() + + +def find_all_linear_modules( + model: "PreTrainedModel", + quantization_bit: Optional[int] = None +) -> List[str]: + r""" + Finds all available modules to apply lora. 
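+ Output projections such as `lm_head` (and ChatGLM's `output_layer`) are excluded from the result.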
+ """ + if quantization_bit is not None: + import bitsandbytes as bnb + linear_cls = bnb.nn.Linear4bit if quantization_bit == 4 else bnb.nn.Linear8bitLt + else: + linear_cls = torch.nn.Linear + + output_layer_names = ["lm_head"] + if model.config.model_type == "chatglm": + output_layer_names.append("output_layer") + + module_names = set() + for name, module in model.named_modules(): + if ( + isinstance(module, linear_cls) + and not any([output_layer in name for output_layer in output_layer_names]) + ): + module_names.add(name.split(".")[-1]) + + logger.info("Found linear modules: {}".format(",".join(module_names))) + return list(module_names) + + +def get_modelcard_args( + model_args: "ModelArguments", + data_args: "DataArguments", + finetuning_args: "FinetuningArguments" +) -> Dict[str, Any]: + return { + "tasks": "text-generation", + "license": "other", + "finetuned_from": model_args.model_name_or_path, + "dataset": [dataset.strip() for dataset in data_args.dataset.split(",")], + "tags": ["llama-factory"] + (["lora"] if finetuning_args.finetuning_type == "lora" else []) + } + + +def load_valuehead_params( + path_or_repo_id: str, + model_args: "ModelArguments" +) -> Dict[str, torch.Tensor]: + r""" + Loads value head parameters from Hugging Face Hub or local disk. + + Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`. + """ + kwargs = { + "path_or_repo_id": path_or_repo_id, + "cache_dir": model_args.cache_dir + } + + if "token" in inspect.signature(cached_file).parameters: + kwargs["token"] = model_args.hf_hub_token + elif "use_auth_token" in inspect.signature(cached_file).parameters: # for transformers==4.31.0 + kwargs["use_auth_token"] = model_args.hf_hub_token + else: + logger.warning("Ignore `hf_hub_token` since matched parameter is not found.") + + try: + vhead_file = cached_file(filename=WEIGHTS_NAME, **kwargs) + return torch.load(vhead_file, map_location="cpu") + except Exception as err: + logger.info("Failed to load {}: {}".format(WEIGHTS_NAME, str(err))) + + try: + from safetensors import safe_open + vhead_file = cached_file(filename=SAFE_WEIGHTS_NAME, **kwargs) + with safe_open(vhead_file, framework="pt", device="cpu") as f: + return { + "v_head.summary.weight": f.get_tensor("v_head.summary.weight"), + "v_head.summary.bias": f.get_tensor("v_head.summary.bias") + } + except Exception as err: + logger.info("Failed to load {}: {}".format(SAFE_WEIGHTS_NAME, str(err))) + + logger.warning("Provided path ({}) does not contain valuehead weights.".format(path_or_repo_id)) + return None + + +def prepare_model_for_training( + model: "PreTrainedModel", + finetuning_args: "FinetuningArguments", + output_layer_name: Optional[str] = "lm_head", + use_gradient_checkpointing: Optional[bool] = True, + layernorm_names: Optional[Set[str]] = LAYERNORM_NAMES +) -> "PreTrainedModel": + r""" + Includes: + (1) cast the layernorm in fp32 + (2) make output embedding layer require grads + (3) upcast the lm_head to fp32 + Inspired by: https://github.com/huggingface/peft/blob/v0.2.0/src/peft/utils/other.py#L33 + """ + if finetuning_args.upcast_layernorm: + for name, param in model.named_parameters(): + if param.ndim == 1 and any(ln_name in name for ln_name in layernorm_names): + param.data = param.data.to(torch.float32) + logger.info("Upcasting weights in layernorm in float32.") + + if finetuning_args.neft_alpha > 1e-6: + def neftune_forward_hook(module: torch.nn.Module, args: Tuple[torch.Tensor], output: torch.Tensor): + if module.training: + dims = torch.tensor(output.size(1) * 
output.size(2)) + mag_norm = finetuning_args.neft_alpha / torch.sqrt(dims) + output = output + torch.zeros_like(output).uniform_(-mag_norm, mag_norm) + return output + + model.get_input_embeddings().register_forward_hook(neftune_forward_hook) + logger.info("Using noisy embedding with alpha={:.2f}".format(finetuning_args.neft_alpha)) + + if use_gradient_checkpointing and getattr(model, "supports_gradient_checkpointing", False): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + def make_inputs_require_grad(module: torch.nn.Module, args: Tuple[torch.Tensor], output: torch.Tensor): + output.requires_grad_(True) + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + model.gradient_checkpointing_enable() + model.config.use_cache = False # turn off when gradient checkpointing is enabled + logger.info("Gradient checkpointing enabled.") + + if finetuning_args.finetuning_type != "full" and hasattr(model, output_layer_name): + output_layer = getattr(model, output_layer_name) + if isinstance(output_layer, torch.nn.Linear): + def fp32_forward_pre_hook(module: torch.nn.Module, args: Tuple[torch.Tensor]): + return args[0].to(output_layer.weight.dtype) + def fp32_forward_post_hook(module: torch.nn.Module, args: Tuple[torch.Tensor], output: torch.Tensor): + return output.to(torch.float32) + output_layer.register_forward_pre_hook(fp32_forward_pre_hook) + output_layer.register_forward_hook(fp32_forward_post_hook) + + return model diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/train/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e57c163bef79ac6240fbff326109c8cad5ddfe89 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/__init__.py @@ -0,0 +1 @@ +from llmtuner.train.tuner import export_model, run_exp diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/dpo/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/train/dpo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..96c8ed0947f1e5f9938a595f214a36e42adfd9b7 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/dpo/__init__.py @@ -0,0 +1 @@ +from llmtuner.train.dpo.workflow import run_dpo diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/dpo/collator.py b/LLM-Detector-V4-11w/src/llmtuner/train/dpo/collator.py new file mode 100644 index 0000000000000000000000000000000000000000..5c862b4f89af2d5cf0c1e32c446c54f21a475b5d --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/dpo/collator.py @@ -0,0 +1,51 @@ +import torch +from dataclasses import dataclass +from typing import Any, Dict, List, Sequence, Tuple +from transformers import DataCollatorForSeq2Seq + + +@dataclass +class DPODataCollatorWithPadding(DataCollatorForSeq2Seq): + r""" + Data collator for pairwise data. 
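+ Expects features with `prompt_ids`, `chosen_ids` and `rejected_ids`; chosen and rejected answers are padded and batched together.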
+ """ + + def _pad_labels(self, batch: torch.Tensor, positions: List[Tuple[int, int]]) -> torch.Tensor: + padded_labels = [] + for feature, (prompt_len, answer_len) in zip(batch, positions): + if self.tokenizer.padding_side == "left": + start, end = feature.size(0) - answer_len, feature.size(0) + else: + start, end = prompt_len, prompt_len + answer_len + padded_tensor = self.label_pad_token_id * torch.ones_like(feature) + padded_tensor[start:end] = feature[start:end] + padded_labels.append(padded_tensor) + return torch.stack(padded_labels, dim=0).contiguous() # in contiguous memory + + def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: + r""" + Pads batched data to the longest sequence in the batch. + + We generate 2 * n examples where the first n examples represent chosen examples and + the last n examples represent rejected examples. + """ + concatenated_features = [] + label_positions = [] + for key in ("chosen_ids", "rejected_ids"): + for feature in features: + prompt_len, answer_len = len(feature["prompt_ids"]), len(feature[key]) + concatenated_features.append({ + "input_ids": feature["prompt_ids"] + feature[key], + "attention_mask": [1] * (prompt_len + answer_len) + }) + label_positions.append((prompt_len, answer_len)) + + batch = self.tokenizer.pad( + concatenated_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + batch["labels"] = self._pad_labels(batch["input_ids"], label_positions) + return batch diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/dpo/trainer.py b/LLM-Detector-V4-11w/src/llmtuner/train/dpo/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..ccf49a7f74a97772a6c6bd76b16d1887a7b3fa3c --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/dpo/trainer.py @@ -0,0 +1,75 @@ +import torch +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union +from transformers import BatchEncoding, Trainer +from trl import DPOTrainer +from trl.trainer.utils import disable_dropout_in_model + +from llmtuner.extras.constants import IGNORE_INDEX + +if TYPE_CHECKING: + from transformers import PreTrainedModel + + +class CustomDPOTrainer(DPOTrainer): + + def __init__( + self, + beta: float, + model: Union["PreTrainedModel", torch.nn.Module], + ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]] = None, + disable_dropout: Optional[bool] = True, + loss_type: Optional[Literal["sigmoid", "hinge"]] = "sigmoid", + **kwargs + ): + if disable_dropout: + disable_dropout_in_model(model) + if ref_model is not None: + disable_dropout_in_model(ref_model) + + self.is_encoder_decoder = model.config.is_encoder_decoder + self.ref_model = ref_model + self.use_dpo_data_collator = True # hack to avoid warning + self.generate_during_eval = False # disable at evaluation + self.label_pad_token_id = IGNORE_INDEX + self.padding_value = 0 + self.beta = beta + self.loss_type = loss_type + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + Trainer.__init__(self, model=model, **kwargs) + if not hasattr(self, "accelerator"): + raise AttributeError("Please update `transformers`.") + + if ref_model is not None: + if self.is_deepspeed_enabled: + if not ( + getattr(ref_model, "is_loaded_in_8bit", False) + or getattr(ref_model, "is_loaded_in_4bit", False) + ): # quantized models are already set on the correct device + self.ref_model = self._prepare_deepspeed(self.ref_model) + else: + 
self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + def concatenated_forward( + self, + model: Optional[torch.nn.Module] = None, + batch: Optional[Dict[str, torch.Tensor]] = None + ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + batch_copied = BatchEncoding({k: v.detach().clone() for k, v in batch.items()}) # avoid error + + all_logits = model( + input_ids=batch_copied["input_ids"], + attention_mask=batch_copied["attention_mask"], + return_dict=True + ).logits.to(torch.float32) + + all_logps = self._get_batch_logps( + all_logits, + batch["labels"], + average_log_prob=False + ) + batch_size = batch["input_ids"].size(0) // 2 + chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0) + chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0) + return chosen_logps, rejected_logps, chosen_logits, rejected_logits diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/dpo/workflow.py b/LLM-Detector-V4-11w/src/llmtuner/train/dpo/workflow.py new file mode 100644 index 0000000000000000000000000000000000000000..6b5a222dc21d9919660264bcfb0d84994ea94ac0 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/dpo/workflow.py @@ -0,0 +1,80 @@ +# Inspired by: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py + +from typing import TYPE_CHECKING, Optional, List +from transformers import Seq2SeqTrainingArguments + +from llmtuner.data import get_dataset, preprocess_dataset, split_dataset +from llmtuner.extras.constants import IGNORE_INDEX +from llmtuner.extras.ploting import plot_loss +from llmtuner.hparams import ModelArguments +from llmtuner.model import load_model_and_tokenizer +from llmtuner.train.dpo.collator import DPODataCollatorWithPadding +from llmtuner.train.dpo.trainer import CustomDPOTrainer +from llmtuner.train.utils import create_modelcard_and_push, create_ref_model + +if TYPE_CHECKING: + from transformers import TrainerCallback + from llmtuner.hparams import DataArguments, FinetuningArguments + + +def run_dpo( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + callbacks: Optional[List["TrainerCallback"]] = None +): + dataset = get_dataset(model_args, data_args) + model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="sft") + dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="rm") + data_collator = DPODataCollatorWithPadding( + tokenizer=tokenizer, + pad_to_multiple_of=4, + label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + ) + + # Create reference model + if finetuning_args.ref_model is None and (not training_args.do_train): # use the model itself + ref_model = model + else: + ref_model = create_ref_model(model_args, finetuning_args, stage="dpo") + + # Update arguments + training_args_dict = training_args.to_dict() + training_args_dict.update(dict(remove_unused_columns=False)) # important for pairwise dataset + training_args = Seq2SeqTrainingArguments(**training_args_dict) + + # Initialize our Trainer + trainer = CustomDPOTrainer( + beta=finetuning_args.dpo_beta, + model=model, + ref_model=ref_model, + args=training_args, + tokenizer=tokenizer, + data_collator=data_collator, + callbacks=callbacks, + **split_dataset(dataset, data_args, training_args) + ) + + # Training + if training_args.do_train: + train_result = 
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + if trainer.is_world_process_zero() and finetuning_args.plot_loss: + plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + + # Evaluation + if training_args.do_eval: + metrics = trainer.evaluate(metric_key_prefix="eval") + if id(model) == id(ref_model): # unable to compute rewards without a reference model + remove_keys = [key for key in metrics.keys() if "rewards" in key] + for key in remove_keys: + metrics.pop(key) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Create model card + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/ppo/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/train/ppo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c32b23fa3fe9fec27c920188d31ffe0849f3a35a --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/ppo/__init__.py @@ -0,0 +1 @@ +from llmtuner.train.ppo.workflow import run_ppo diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/ppo/trainer.py b/LLM-Detector-V4-11w/src/llmtuner/train/ppo/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b81aa7ff5278c5b74a05eb1744579ae3bdfa5f27 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/ppo/trainer.py @@ -0,0 +1,359 @@ +import os +import sys +import math +import torch +from tqdm import tqdm +from typing import TYPE_CHECKING, List, Optional, Tuple + +from transformers import BatchEncoding, GenerationConfig, Trainer, TrainerState, TrainerControl +from transformers.utils import WEIGHTS_NAME, SAFE_WEIGHTS_NAME +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR +from transformers.trainer_pt_utils import remove_dummy_checkpoint + +from trl import PPOTrainer +from trl.core import PPODecorators, logprobs_from_logits + +from llmtuner.extras.callbacks import LogCallback, SavePeftModelCallback +from llmtuner.extras.logging import get_logger +from llmtuner.extras.misc import AverageMeter, count_parameters, get_logits_processor +from llmtuner.train.ppo.utils import dump_layernorm, restore_layernorm, replace_model + +if TYPE_CHECKING: + from transformers import Seq2SeqTrainingArguments, TrainerCallback + from trl import AutoModelForCausalLMWithValueHead + from llmtuner.hparams import ModelArguments, FinetuningArguments, GeneratingArguments + + +logger = get_logger(__name__) + + +class CustomPPOTrainer(PPOTrainer, Trainer): + r""" + Inherits PPOTrainer. 
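+ Adds a Trainer-style PPO loop (`ppo_train`) with logging, checkpoint saving and reward-model switching.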
+ """ + + def __init__( + self, + model_args: "ModelArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + callbacks: List["TrainerCallback"], + reward_model: "AutoModelForCausalLMWithValueHead", + **kwargs + ): + PPOTrainer.__init__(self, **kwargs) + + self.args = training_args + self.model_args = model_args + self.finetuning_args = finetuning_args + self.reward_model = reward_model + + self.generation_config = GenerationConfig( + pad_token_id=self.tokenizer.pad_token_id, + eos_token_id=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids, + **generating_args.to_dict() + ) + + self.state = TrainerState() + self.control = TrainerControl() + self.is_deepspeed_enabled = self.accelerator.distributed_type == "DEEPSPEED" and hasattr( + self.accelerator.state, "deepspeed_plugin" + ) + self.log_callback, self.save_callback = callbacks[0], callbacks[1] + assert isinstance(self.log_callback, LogCallback) and isinstance(self.save_callback, SavePeftModelCallback) + + if self.args.max_steps > 0: + logger.info("max_steps is given, it will override any value given in num_train_epochs") + + if reward_model is not None: + if self.is_deepspeed_enabled: + if not ( + getattr(reward_model.pretrained_model, "is_loaded_in_8bit", False) + or getattr(reward_model.pretrained_model, "is_loaded_in_4bit", False) + ): # quantized models are already set on the correct device + self.reward_model = self._prepare_deepspeed(self.reward_model) + else: + self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) + + def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None: + r""" + Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer. + """ + if resume_from_checkpoint is not None: + raise ValueError("`resume_from_checkpoint` will be supported in the future version.") + + total_train_batch_size = ( + self.args.per_device_train_batch_size + * self.args.gradient_accumulation_steps + * self.finetuning_args.ppo_buffer_size + * self.args.world_size + ) + if self.args.max_steps > 0: + num_examples = total_train_batch_size * self.args.max_steps + num_train_epochs = sys.maxsize + max_steps = self.args.max_steps + steps_in_epoch = self.args.max_steps + else: + len_dataloader = len(self.dataloader) + num_examples = len(self.dataset) + num_train_epochs = self.args.num_train_epochs + max_steps = math.ceil(num_train_epochs * len_dataloader) + steps_in_epoch = len_dataloader + + self.state.max_steps = max_steps + self.state.num_train_epochs = num_train_epochs + self.state.is_local_process_zero = self.is_local_process_zero() + self.state.is_world_process_zero = self.is_world_process_zero() + + if self.is_world_process_zero(): + logger.info("***** Running training *****") + logger.info(" Num examples = {}".format(num_examples)) + logger.info(" Num Epochs = {}".format(num_train_epochs)) + logger.info(" Instantaneous batch size per device = {}".format(self.args.per_device_train_batch_size)) + logger.info(" Total train batch size (w. 
parallel, buffer, distributed & accumulation) = {}".format( + total_train_batch_size + )) + logger.info(" Gradient Accumulation steps = {}".format(self.args.gradient_accumulation_steps)) + logger.info(" Num optimization epochs per batch = {}".format(self.finetuning_args.ppo_epochs)) + logger.info(" Total training steps = {}".format(max_steps)) + logger.info(" Number of trainable parameters = {}".format(count_parameters(self.model)[0])) + + unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) + dataiter = iter(self.dataloader) + loss_meter = AverageMeter() + reward_meter = AverageMeter() + self.log_callback.on_train_begin(self.args, self.state, self.control) + + for step in tqdm(range(max_steps), disable=not self.is_local_process_zero()): + try: + batch = next(dataiter) + except StopIteration: + dataiter = iter(self.dataloader) + batch = next(dataiter) + + # Cast to inference mode + unwrapped_model.gradient_checkpointing_disable() + unwrapped_model.config.use_cache = True + self.model.eval() + + # Get inputs + self.tokenizer.padding_side = "right" # change padding side + queries, responses, rewards = [], [], [] + for idx in range(0, self.config.batch_size, self.config.mini_batch_size): + mini_batch_queries, mini_batch_responses = self.get_inputs(batch[idx:idx+self.config.mini_batch_size]) + mini_batch_rewards = self.get_rewards(mini_batch_queries, mini_batch_responses, unwrapped_model) + queries.extend(mini_batch_queries) + responses.extend(mini_batch_responses) + rewards.extend(mini_batch_rewards) + + # Cast to training mode + unwrapped_model.gradient_checkpointing_enable() + unwrapped_model.config.use_cache = False + self.model.train() + + # Run PPO step + stats = self.step(queries, responses, rewards) + self.tokenizer.padding_side = "left" # restore padding side + loss_meter.update(float(stats["ppo/loss/total"]), n=len(rewards)) + reward_meter.update(torch.stack(rewards).mean().item(), n=len(rewards)) + + if self.config.log_with is not None: + try: + batch["query"] = self.tokenizer.batch_decode(queries, skip_special_tokens=True) + batch["response"] = self.tokenizer.batch_decode(responses, skip_special_tokens=True) + self.log_stats(stats, batch, rewards) + except: + logger.warning("Failed to save stats due to unknown errors.") + + self.state.global_step += 1 + self.log_callback.on_step_end(self.args, self.state, self.control) + + if self.is_local_process_zero() and (step+1) % self.args.logging_steps == 0: + logs = dict( + loss=round(loss_meter.avg, 4), + reward=round(reward_meter.avg, 4), + learning_rate=stats["ppo/learning_rate"], + epoch=round(step / steps_in_epoch, 2) + ) + tqdm.write(str(logs)) + logs["step"] = step + self.state.log_history.append(logs) + self.log_callback.on_log(self.args, self.state, self.control) + loss_meter.reset() + reward_meter.reset() + + if (step+1) % self.args.save_steps == 0: # save checkpoint + self.save_model(os.path.join( + self.args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, self.state.global_step) + )) + self.save_callback.on_save( + self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model) + ) + + if self.control.should_epoch_stop or self.control.should_training_stop: + break + + self.log_callback.on_train_end(self.args, self.state, self.control) + self.save_callback.on_train_end( + self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model) + ) + + @torch.no_grad() + def get_inputs(self, batch: BatchEncoding) -> Tuple[List[torch.Tensor], 
List[torch.Tensor]]: + r""" + Generates model's responses given queries. + """ + if self.finetuning_args.upcast_layernorm: + layernorm_params = dump_layernorm(self.model) + + unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) + response: torch.Tensor = unwrapped_model.generate( + generation_config=self.generation_config, + logits_processor=get_logits_processor(), + **batch + ) + + if self.finetuning_args.upcast_layernorm: + restore_layernorm(self.model, layernorm_params) + + query, response = batch["input_ids"].detach().cpu(), response[:, batch["input_ids"].size(-1):].detach().cpu() + queries, responses = [], [] + for i in range(len(query)): + query_length = (query[i] != self.tokenizer.pad_token_id).nonzero()[0].item() + response_index = (response[i] != self.tokenizer.pad_token_id).nonzero() + + if len(response_index) == 0: + response_length = 1 # allow empty response + else: + response_length = response_index[-1].item() + 1 + + queries.append(query[i, query_length:]) # remove padding from left + responses.append(response[i, :response_length]) # remove padding from right + + return queries, responses + + @torch.no_grad() + def get_rewards( + self, + queries: List[torch.Tensor], + responses: List[torch.Tensor], + unwrapped_model: "AutoModelForCausalLMWithValueHead" + ) -> List[torch.Tensor]: + r""" + Computes scores using given reward model. + """ + if self.reward_model is None: + replace_model(unwrapped_model, target="reward") + + batch = self.prepare_model_inputs(queries, responses) + + with torch.cuda.amp.autocast(dtype=self.model_args.compute_dtype): # support bf16 + reward_model = self.reward_model if self.reward_model is not None else self.model + _, _, values = reward_model(**batch, output_hidden_states=True, return_dict=True) + + if getattr(unwrapped_model.config, "model_type", None) == "chatglm": + values = torch.transpose(values, 0, 1) + + rewards = [] + for i in range(values.size(0)): + end_indexes = (batch["input_ids"][i] != self.tokenizer.pad_token_id).nonzero() + end_index = end_indexes[-1].item() if len(end_indexes) else 0 + rewards.append(values[i, end_index].float().detach().cpu()) # use fp32 type + + if self.reward_model is None: + replace_model(unwrapped_model, target="default") + + return rewards + + @PPODecorators.empty_device_cache() + def batched_forward_pass( + self, + model: "AutoModelForCausalLMWithValueHead", + queries: torch.Tensor, + responses: torch.Tensor, + model_inputs: dict, + return_logits: Optional[bool] = False, + response_masks: Optional[torch.Tensor] = None + ): + r""" + Calculates model outputs in multiple batches. + + Subclass and override to inject custom behavior. 
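+ Forward passes run in `mini_batch_size` chunks under autocast so bf16/fp16 compute dtypes are supported.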
+ """ + bs = len(queries) + fbs = self.config.mini_batch_size + all_logprobs = [] + all_logits = [] + all_masks = [] + all_values = [] + + for i in range(math.ceil(bs / fbs)): + input_kwargs = {key: value[i * fbs : (i + 1) * fbs] for key, value in model_inputs.items()} + query_batch = queries[i * fbs : (i + 1) * fbs] + response_batch = responses[i * fbs : (i + 1) * fbs] + if response_masks is not None: + response_masks_batch = response_masks[i * fbs : (i + 1) * fbs] + input_ids = input_kwargs["input_ids"] + attention_mask = input_kwargs["attention_mask"] + + with torch.cuda.amp.autocast(dtype=self.model_args.compute_dtype): # support bf16 + logits, _, values = model(**input_kwargs) + + unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) + if getattr(unwrapped_model.config, "model_type", None) == "chatglm": + values = torch.transpose(values, 0, 1) + + logprobs = logprobs_from_logits(logits[:, :-1, :], input_ids[:, 1:]) + masks = torch.zeros_like(attention_mask) + masks[:, :-1] = attention_mask[:, 1:] + + for j in range(len(query_batch)): + start = len(query_batch[j]) - 1 + if attention_mask[j, 0] == 0: # offset left padding + start += attention_mask[j, :].nonzero()[0].item() + end = start + len(response_batch[j]) + + if response_masks is not None: + response_masks_batch = torch.cat( + (torch.zeros_like(query_batch[j]), response_masks_batch[j]) + )[1:] + + masks[j, :start] = 0 + masks[j, end:] = 0 + if response_masks is not None: + masks[j, start:end] = masks[j, start:end] * response_masks_batch[j][start:end] + + if return_logits: + all_logits.append(logits) + else: + del logits + + all_values.append(values) + all_logprobs.append(logprobs) + all_masks.append(masks) + + return ( + torch.cat(all_logprobs), + torch.cat(all_logits)[:, :-1] if return_logits else None, + torch.cat(all_values)[:, :-1], + torch.cat(all_masks)[:, :-1], + ) + + def save_model(self, output_dir: Optional[str] = None) -> None: + r""" + Saves model checkpoint. + + Subclass and override to inject custom behavior. + """ + if self.args.should_save: + try: + self._save(output_dir, state_dict=self.accelerator.get_state_dict(self.model)) + except ValueError: + logger.warning( + " stage3_gather_16bit_weights_on_model_save=false. 
Saving the full checkpoint instead, use" + " zero_to_fp32.py to recover weights" + ) + self._save(output_dir, state_dict={}) + remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME]) + self.model.save_checkpoint(output_dir) # wrapped model diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/ppo/utils.py b/LLM-Detector-V4-11w/src/llmtuner/train/ppo/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..74453a39c9f1be0fdedfd7b504e5e94bbeb461cd --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/ppo/utils.py @@ -0,0 +1,35 @@ +import torch +from typing import TYPE_CHECKING, Dict, Literal, Optional + +if TYPE_CHECKING: + from transformers import PreTrainedModel + from trl import AutoModelForCausalLMWithValueHead + + +def replace_model(model: "AutoModelForCausalLMWithValueHead", target: Literal["default", "reward"]) -> None: + if target == "reward": # save default head temporarily + valuehead_state_dict: Dict[str, torch.Tensor] = model.v_head.state_dict() + setattr(model, "default_head_weight", valuehead_state_dict["summary.weight"].detach().clone()) + setattr(model, "default_head_bias", valuehead_state_dict["summary.bias"].detach().clone()) + + model.pretrained_model.set_adapter(target) # set the LoRA adapter to be active + model.v_head.load_state_dict({ + "summary.weight": model.get_buffer("{}_head_weight".format(target)).detach().clone(), + "summary.bias": model.get_buffer("{}_head_bias".format(target)).detach().clone() + }) + + +def dump_layernorm(model: "PreTrainedModel") -> Dict[str, torch.Tensor]: + layer_norm_params = {} + for name, param in model.named_parameters(): + if param.data.dtype == torch.float32: + layer_norm_params[name] = param.data.detach().clone() + param.data = param.data.to(model.config.torch_dtype) + + return layer_norm_params + + +def restore_layernorm(model: "PreTrainedModel", layernorm_params: Optional[Dict[str, torch.Tensor]] = None) -> None: + for name, param in model.named_parameters(): + if name in layernorm_params: + param.data = layernorm_params[name] diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/ppo/workflow.py b/LLM-Detector-V4-11w/src/llmtuner/train/ppo/workflow.py new file mode 100644 index 0000000000000000000000000000000000000000..88d5e49d3fcef3a4d24da0ba3bcb6b47d1c1cdcd --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/ppo/workflow.py @@ -0,0 +1,100 @@ +# Inspired by: https://github.com/lvwerra/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py + +import math +from trl import PPOConfig +from torch.optim import AdamW +from typing import TYPE_CHECKING, Optional, List +from transformers import DataCollatorWithPadding +from transformers.optimization import get_scheduler + +from llmtuner.data import get_dataset, preprocess_dataset +from llmtuner.extras.callbacks import SavePeftModelCallback +from llmtuner.extras.ploting import plot_loss +from llmtuner.model import load_model_and_tokenizer +from llmtuner.train.utils import create_ref_model, create_reward_model +from llmtuner.train.ppo.trainer import CustomPPOTrainer + +if TYPE_CHECKING: + from transformers import Seq2SeqTrainingArguments, TrainerCallback + from llmtuner.hparams import ModelArguments, DataArguments, FinetuningArguments, GeneratingArguments + + +def run_ppo( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + callbacks: Optional[List["TrainerCallback"]] 
= None +): + dataset = get_dataset(model_args, data_args) + model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="ppo") + dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="ppo") + + tokenizer.padding_side = "left" # use left-padding in generation while using right-padding in training + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # Create reference model and reward model + ref_model = create_ref_model(model_args, finetuning_args, stage="ppo") + reward_model = create_reward_model(model, model_args, finetuning_args) + + # Create ppo config + backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps + ppo_config = PPOConfig( + model_name=model_args.model_name_or_path, + learning_rate=training_args.learning_rate, + mini_batch_size=training_args.per_device_train_batch_size, + batch_size=backward_batch_size * finetuning_args.ppo_buffer_size, + gradient_accumulation_steps=training_args.gradient_accumulation_steps, + ppo_epochs=finetuning_args.ppo_epochs, + max_grad_norm=training_args.max_grad_norm, + seed=training_args.seed, + optimize_device_cache=True, + target=finetuning_args.ppo_target, + log_with=finetuning_args.ppo_logger, + use_score_scaling=finetuning_args.ppo_score_norm, + use_score_norm=finetuning_args.ppo_score_norm, + whiten_rewards=finetuning_args.ppo_whiten_rewards, + accelerator_kwargs={"step_scheduler_with_optimizer": False} + ) + + # Create optimizer and scheduler + optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=training_args.learning_rate) + if training_args.max_steps > 0: + num_training_steps = training_args.max_steps + else: + total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size + num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size) + + lr_scheduler = get_scheduler( + training_args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=training_args.get_warmup_steps(num_training_steps), + num_training_steps=num_training_steps + ) + + # Initialize our Trainer + ppo_trainer = CustomPPOTrainer( + model_args=model_args, + training_args=training_args, + finetuning_args=finetuning_args, + generating_args=generating_args, + callbacks=callbacks + [SavePeftModelCallback()], + reward_model=reward_model, + config=ppo_config, + model=model, + ref_model=ref_model, + tokenizer=tokenizer, + dataset=dataset, + data_collator=data_collator, + optimizer=optimizer, + lr_scheduler=lr_scheduler + ) + + # Training + if training_args.do_train: + ppo_trainer.ppo_train(resume_from_checkpoint=training_args.resume_from_checkpoint) + ppo_trainer.save_model() + ppo_trainer.save_state() # must be called after save_model to have a folder + if ppo_trainer.is_world_process_zero() and finetuning_args.plot_loss: + plot_loss(training_args.output_dir, keys=["loss", "reward"]) diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/pt/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/train/pt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..eacbeadb461af572ca7554c163b9e6b773388a8d --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/pt/__init__.py @@ -0,0 +1 @@ +from llmtuner.train.pt.workflow import run_pt diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/pt/workflow.py b/LLM-Detector-V4-11w/src/llmtuner/train/pt/workflow.py new file mode 100644 index 
0000000000000000000000000000000000000000..eadfa0019593b7fa65f1072c98b8ce7c8b19675c --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/pt/workflow.py @@ -0,0 +1,62 @@ +# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/language-modeling/run_clm.py + +import math +from typing import TYPE_CHECKING, Optional, List +from transformers import DataCollatorForLanguageModeling, Trainer + +from llmtuner.data import get_dataset, preprocess_dataset, split_dataset +from llmtuner.extras.ploting import plot_loss +from llmtuner.model import load_model_and_tokenizer +from llmtuner.train.utils import create_modelcard_and_push + +if TYPE_CHECKING: + from transformers import Seq2SeqTrainingArguments, TrainerCallback + from llmtuner.hparams import ModelArguments, DataArguments, FinetuningArguments + + +def run_pt( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + callbacks: Optional[List["TrainerCallback"]] = None +): + dataset = get_dataset(model_args, data_args) + model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="pt") + dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="pt") + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + tokenizer=tokenizer, + data_collator=data_collator, + callbacks=callbacks, + **split_dataset(dataset, data_args, training_args) + ) + + # Training + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + if trainer.is_world_process_zero() and finetuning_args.plot_loss: + plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + + # Evaluation + if training_args.do_eval: + metrics = trainer.evaluate(metric_key_prefix="eval") + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + + metrics["perplexity"] = perplexity + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Create model card + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/rm/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/train/rm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c80ccfb9f16ad0a74e56762faa6d435c53b1ba25 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/rm/__init__.py @@ -0,0 +1 @@ +from llmtuner.train.rm.workflow import run_rm diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/rm/collator.py b/LLM-Detector-V4-11w/src/llmtuner/train/rm/collator.py new file mode 100644 index 0000000000000000000000000000000000000000..161f003d0ac106a203bd04418676883d943a4da7 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/rm/collator.py @@ -0,0 +1,27 @@ +import torch +from dataclasses import dataclass +from typing import Any, Dict, Sequence +from transformers import DataCollatorWithPadding + + +@dataclass +class PairwiseDataCollatorWithPadding(DataCollatorWithPadding): + r""" + Data collator for pairwise data. 
+ """ + + def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: + r""" + Pads batched data to the longest sequence in the batch. + + We generate 2 * n examples where the first n examples represent chosen examples and + the last n examples represent rejected examples. + """ + features = [ + { + "input_ids": feature["prompt_ids"] + feature[key], + "attention_mask": [1] * (len(feature["prompt_ids"]) + len(feature[key])) + } + for key in ("chosen_ids", "rejected_ids") for feature in features + ] + return super().__call__(features) diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/rm/metric.py b/LLM-Detector-V4-11w/src/llmtuner/train/rm/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..db9c924304a11aaa1babce8c7820d49b3828a046 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/rm/metric.py @@ -0,0 +1,7 @@ +import numpy as np +from typing import Dict, Sequence, Tuple, Union + + +def compute_accuracy(eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]: + preds, _ = eval_preds + return {"accuracy": (preds[0] > preds[1]).sum() / len(preds[0])} diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/rm/trainer.py b/LLM-Detector-V4-11w/src/llmtuner/train/rm/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b018a8c4d09c70c4c597c79aae5f605ba8fe4cc9 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/rm/trainer.py @@ -0,0 +1,103 @@ +import os +import json +import torch +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from transformers import Trainer + +from llmtuner.extras.logging import get_logger + +if TYPE_CHECKING: + from transformers.trainer import PredictionOutput + from transformers.modeling_utils import PreTrainedModel + + +logger = get_logger(__name__) + + +class PairwiseTrainer(Trainer): + r""" + Inherits PeftTrainer to compute pairwise loss. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.can_return_loss = True # override property to return eval_loss + + def compute_loss( + self, + model: "PreTrainedModel", + inputs: Dict[str, torch.Tensor], + return_outputs: Optional[bool] = False + ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]: + r""" + Computes pairwise loss. The first n examples are chosen and the last n examples are rejected. + + Subclass and override to inject custom behavior. + + Note that the first element will be removed from the output tuple. + See: https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/trainer.py#L3509 + """ + # Compute rewards + _, _, values = model(**inputs, output_hidden_states=True, return_dict=True) + + unwrapped_model: "PreTrainedModel" = self.accelerator.unwrap_model(self.model) + if getattr(unwrapped_model.config, "model_type", None) == "chatglm": + values = torch.transpose(values, 0, 1) + + # Split the inputs and rewards into two parts, chosen and rejected + batch_size = inputs["input_ids"].size(0) // 2 + chosen_input_ids, rejected_input_ids = inputs["input_ids"][:batch_size], inputs["input_ids"][batch_size:] + chosen_rewards, rejected_rewards = values[:batch_size], values[batch_size:] + chosen_scores, rejected_scores = [], [] + + # Compute pairwise loss. 
Only backprop on the different tokens before padding + # Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py + loss = 0 + for i in range(batch_size): + chosen_length = (chosen_input_ids[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1 + rejected_length = (rejected_input_ids[i] != self.tokenizer.pad_token_id).nonzero()[-1] + 1 + check_divergence = (chosen_input_ids[i] != rejected_input_ids[i]).nonzero() + + if len(check_divergence) == 0: + end_index = chosen_length + div_index = end_index - 1 + else: + end_index = max(chosen_length, rejected_length) + div_index = check_divergence[0] + + assert div_index > 0 + chosen_trunc_rewards = chosen_rewards[i, div_index:end_index] + rejected_trunc_rewards = rejected_rewards[i, div_index:end_index] + if return_outputs: # use the score on the last token except pad token for inference + chosen_scores.append(chosen_rewards[i, chosen_length-1]) + rejected_scores.append(rejected_rewards[i, rejected_length-1]) + loss += -torch.nn.functional.logsigmoid(chosen_trunc_rewards - rejected_trunc_rewards).mean() + + loss = loss / batch_size + if return_outputs: + chosen_scores, rejected_scores = torch.stack(chosen_scores), torch.stack(rejected_scores) + return loss, [loss, chosen_scores, rejected_scores] + + return loss + + def save_predictions( + self, + predict_results: "PredictionOutput" + ) -> None: + r""" + Saves model predictions to `output_dir`. + + A custom behavior that not contained in Seq2SeqTrainer. + """ + if not self.is_world_process_zero(): + return + + output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl") + logger.info(f"Saving prediction results to {output_prediction_file}") + chosen_scores, rejected_scores = predict_results.predictions + + with open(output_prediction_file, "w", encoding="utf-8") as writer: + res: List[str] = [] + for c_score, r_score in zip(chosen_scores, rejected_scores): + res.append(json.dumps({"chosen": round(float(c_score), 2), "rejected": round(float(r_score), 2)})) + writer.write("\n".join(res)) diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/rm/workflow.py b/LLM-Detector-V4-11w/src/llmtuner/train/rm/workflow.py new file mode 100644 index 0000000000000000000000000000000000000000..ecc409b770eb5acf940c46bf44b7c8742010375b --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/rm/workflow.py @@ -0,0 +1,72 @@ +# Inspired by: https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/train_reward_model_gptj.py + +from typing import TYPE_CHECKING, Optional, List +from transformers import Seq2SeqTrainingArguments + +from llmtuner.data import get_dataset, preprocess_dataset, split_dataset +from llmtuner.extras.callbacks import SavePeftModelCallback +from llmtuner.extras.ploting import plot_loss +from llmtuner.model import load_model_and_tokenizer +from llmtuner.train.rm.collator import PairwiseDataCollatorWithPadding +from llmtuner.train.rm.metric import compute_accuracy +from llmtuner.train.rm.trainer import PairwiseTrainer +from llmtuner.train.utils import create_modelcard_and_push + +if TYPE_CHECKING: + from transformers import TrainerCallback + from llmtuner.hparams import ModelArguments, DataArguments, FinetuningArguments + + +def run_rm( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + callbacks: Optional[List["TrainerCallback"]] = None +): + dataset = get_dataset(model_args, data_args) + model, 
tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="rm") + dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="rm") + data_collator = PairwiseDataCollatorWithPadding(tokenizer, pad_to_multiple_of=4) + + # Update arguments + training_args_dict = training_args.to_dict() + training_args_dict.update(dict(remove_unused_columns=False)) # important for pairwise dataset + training_args = Seq2SeqTrainingArguments(**training_args_dict) + + # Initialize our Trainer + trainer = PairwiseTrainer( + model=model, + args=training_args, + tokenizer=tokenizer, + data_collator=data_collator, + callbacks=callbacks + [SavePeftModelCallback()], + compute_metrics=compute_accuracy, + **split_dataset(dataset, data_args, training_args) + ) + + # Training + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + if trainer.is_world_process_zero() and finetuning_args.plot_loss: + plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + + # Evaluation + if training_args.do_eval: + metrics = trainer.evaluate(metric_key_prefix="eval") + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Predict + if training_args.do_predict: + predict_results = trainer.predict(dataset, metric_key_prefix="predict") + trainer.log_metrics("predict", predict_results.metrics) + trainer.save_metrics("predict", predict_results.metrics) + trainer.save_predictions(predict_results) + + # Create model card + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/sft/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/train/sft/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cb5448f473d0930fe14097b0c9b862cc4ce7393b --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/sft/__init__.py @@ -0,0 +1 @@ +from llmtuner.train.sft.workflow import run_sft diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/sft/metric.py b/LLM-Detector-V4-11w/src/llmtuner/train/sft/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..18db0b88a7d5b37cf80d6386dfab7c529f8bb44b --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/sft/metric.py @@ -0,0 +1,61 @@ +import numpy as np +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union + +from llmtuner.extras.constants import IGNORE_INDEX +from llmtuner.extras.packages import ( + is_jieba_available, is_nltk_available, is_rouge_available +) + +if TYPE_CHECKING: + from transformers.tokenization_utils import PreTrainedTokenizer + +if is_jieba_available(): + import jieba + +if is_nltk_available(): + from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction + +if is_rouge_available(): + from rouge_chinese import Rouge + + +@dataclass +class ComputeMetrics: + r""" + Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer. + """ + + tokenizer: "PreTrainedTokenizer" + + def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]: + r""" + Uses the model predictions to compute metrics. 
+ """ + preds, labels = eval_preds + score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []} + + preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id) + labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id) + + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + + for pred, label in zip(decoded_preds, decoded_labels): + hypothesis = list(jieba.cut(pred)) + reference = list(jieba.cut(label)) + + if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0: + result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}} + else: + rouge = Rouge() + scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference)) + result = scores[0] + + for k, v in result.items(): + score_dict[k].append(round(v["f"] * 100, 4)) + + bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) + score_dict["bleu-4"].append(round(bleu_score * 100, 4)) + + return {k: float(np.mean(v)) for k, v in score_dict.items()} diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/sft/trainer.py b/LLM-Detector-V4-11w/src/llmtuner/train/sft/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..291bbc7adcc20cfacc8e73046721a5eda008564e --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/sft/trainer.py @@ -0,0 +1,97 @@ +import os +import json +import torch +import numpy as np +import torch.nn as nn +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from transformers import Seq2SeqTrainer + +from llmtuner.extras.constants import IGNORE_INDEX +from llmtuner.extras.logging import get_logger + +if TYPE_CHECKING: + from transformers.trainer import PredictionOutput + + +logger = get_logger(__name__) + + +class CustomSeq2SeqTrainer(Seq2SeqTrainer): + r""" + Inherits PeftTrainer to compute generative metrics such as BLEU and ROUGE. + """ + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + r""" + Removes the prompt part in the generated tokens. + + Subclass and override to inject custom behavior. + """ + labels = inputs["labels"].detach().clone() if "labels" in inputs else None # backup labels + if self.args.predict_with_generate: + assert self.tokenizer.padding_side == "left", "This method only accepts left-padded tensor." 
+ prompt_len, label_len = inputs["input_ids"].size(-1), inputs["labels"].size(-1) + if prompt_len > label_len: + inputs["labels"] = self._pad_tensors_to_target_len(inputs["labels"], inputs["input_ids"]) + if label_len > prompt_len: # truncate the labels instead of padding the inputs (llama2 fp16 compatibility) + inputs["labels"] = inputs["labels"][:, :prompt_len] + + loss, generated_tokens, _ = super().prediction_step( # ignore the returned labels (may be truncated) + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + if generated_tokens is not None and self.args.predict_with_generate: + generated_tokens[:, :prompt_len] = self.tokenizer.pad_token_id + generated_tokens = generated_tokens.contiguous() + + return loss, generated_tokens, labels + + def _pad_tensors_to_target_len( + self, + src_tensor: torch.Tensor, + tgt_tensor: torch.Tensor + ) -> torch.Tensor: + r""" + Pads the tensor to the same length as the target tensor. + """ + assert self.tokenizer.pad_token_id is not None, "Pad token is required." + padded_tensor = self.tokenizer.pad_token_id * torch.ones_like(tgt_tensor) + padded_tensor[:, -src_tensor.shape[-1]:] = src_tensor # adopt left-padding + return padded_tensor.contiguous() # in contiguous memory + + def save_predictions( + self, + predict_results: "PredictionOutput" + ) -> None: + r""" + Saves model predictions to `output_dir`. + + A custom behavior that not contained in Seq2SeqTrainer. + """ + if not self.is_world_process_zero(): + return + + output_prediction_file = os.path.join(self.args.output_dir, "generated_predictions.jsonl") + logger.info(f"Saving prediction results to {output_prediction_file}") + + labels = np.where(predict_results.label_ids != IGNORE_INDEX, predict_results.label_ids, self.tokenizer.pad_token_id) + preds = np.where(predict_results.predictions != IGNORE_INDEX, predict_results.predictions, self.tokenizer.pad_token_id) + + for i in range(len(preds)): + pad_len = np.nonzero(preds[i] != self.tokenizer.pad_token_id)[0] + if len(pad_len): + preds[i] = np.concatenate((preds[i][pad_len[0]:], preds[i][:pad_len[0]]), axis=-1) # move pad token to last + + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False) + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + with open(output_prediction_file, "w", encoding="utf-8") as writer: + res: List[str] = [] + for label, pred in zip(decoded_labels, decoded_preds): + res.append(json.dumps({"label": label, "predict": pred}, ensure_ascii=False)) + writer.write("\n".join(res)) diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/sft/workflow.py b/LLM-Detector-V4-11w/src/llmtuner/train/sft/workflow.py new file mode 100644 index 0000000000000000000000000000000000000000..4e5049036e5c901db94ac1f8daaf0c6cb6496bf3 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/sft/workflow.py @@ -0,0 +1,94 @@ +# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py + +from typing import TYPE_CHECKING, Optional, List +from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments + +from llmtuner.data import get_dataset, preprocess_dataset, split_dataset +from llmtuner.extras.constants import IGNORE_INDEX +from llmtuner.extras.misc import get_logits_processor +from llmtuner.extras.ploting import plot_loss +from llmtuner.model import load_model_and_tokenizer +from llmtuner.train.sft.metric import 
ComputeMetrics +from llmtuner.train.sft.trainer import CustomSeq2SeqTrainer +from llmtuner.train.utils import create_modelcard_and_push + +if TYPE_CHECKING: + from transformers import TrainerCallback + from llmtuner.hparams import ModelArguments, DataArguments, FinetuningArguments, GeneratingArguments + + +def run_sft( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + callbacks: Optional[List["TrainerCallback"]] = None +): + dataset = get_dataset(model_args, data_args) + model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, training_args.do_train, stage="sft") + dataset = preprocess_dataset(dataset, tokenizer, data_args, training_args, stage="sft") + + if training_args.predict_with_generate: + tokenizer.padding_side = "left" # use left-padding in generation + + data_collator = DataCollatorForSeq2Seq( + tokenizer=tokenizer, + pad_to_multiple_of=4 if tokenizer.padding_side == "right" else None, # for shift short attention + label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + ) + + # Override the decoding parameters of Seq2SeqTrainer + training_args_dict = training_args.to_dict() + training_args_dict.update(dict( + generation_max_length=training_args.generation_max_length or data_args.cutoff_len, + generation_num_beams=data_args.eval_num_beams or training_args.generation_num_beams + )) + training_args = Seq2SeqTrainingArguments(**training_args_dict) + + # Initialize our Trainer + trainer = CustomSeq2SeqTrainer( + model=model, + args=training_args, + tokenizer=tokenizer, + data_collator=data_collator, + callbacks=callbacks, + compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + **split_dataset(dataset, data_args, training_args) + ) + + # Keyword arguments for `model.generate` + gen_kwargs = generating_args.to_dict() + gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids + gen_kwargs["pad_token_id"] = tokenizer.pad_token_id + gen_kwargs["logits_processor"] = get_logits_processor() + + # Training + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + if trainer.is_world_process_zero() and finetuning_args.plot_loss: + plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + + # Evaluation + if training_args.do_eval: + metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs) + if training_args.predict_with_generate: # eval_loss will be wrong if predict_with_generate is enabled + metrics.pop("eval_loss", None) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Predict + if training_args.do_predict: + predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs) + if training_args.predict_with_generate: # predict_loss will be wrong if predict_with_generate is enabled + predict_results.metrics.pop("predict_loss", None) + trainer.log_metrics("predict", predict_results.metrics) + trainer.save_metrics("predict", predict_results.metrics) + trainer.save_predictions(predict_results) + + # Create model card + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git 
a/LLM-Detector-V4-11w/src/llmtuner/train/tuner.py b/LLM-Detector-V4-11w/src/llmtuner/train/tuner.py new file mode 100644 index 0000000000000000000000000000000000000000..094aa50f3a4d6046ea1a7d54a1e7ac121f45fb01 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/tuner.py @@ -0,0 +1,56 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from llmtuner.extras.callbacks import LogCallback +from llmtuner.extras.logging import get_logger +from llmtuner.model import get_train_args, get_infer_args, load_model_and_tokenizer +from llmtuner.train.pt import run_pt +from llmtuner.train.sft import run_sft +from llmtuner.train.rm import run_rm +from llmtuner.train.ppo import run_ppo +from llmtuner.train.dpo import run_dpo + +if TYPE_CHECKING: + from transformers import TrainerCallback + + +logger = get_logger(__name__) + + +def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None): + model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args) + callbacks = [LogCallback()] if callbacks is None else callbacks + + if finetuning_args.stage == "pt": + run_pt(model_args, data_args, training_args, finetuning_args, callbacks) + elif finetuning_args.stage == "sft": + run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) + elif finetuning_args.stage == "rm": + run_rm(model_args, data_args, training_args, finetuning_args, callbacks) + elif finetuning_args.stage == "ppo": + run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) + elif finetuning_args.stage == "dpo": + run_dpo(model_args, data_args, training_args, finetuning_args, callbacks) + else: + raise ValueError("Unknown task.") + + +def export_model(args: Optional[Dict[str, Any]] = None): + model_args, _, finetuning_args, _ = get_infer_args(args) + model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args) + + if getattr(model, "quantization_method", None) in ["gptq", "awq"]: + raise ValueError("Cannot export a GPTQ or AWQ quantized model.") + + model.config.use_cache = True + model.save_pretrained(finetuning_args.export_dir, max_shard_size="{}GB".format(finetuning_args.export_size)) + + try: + tokenizer.padding_side = "left" # restore padding side + tokenizer.init_kwargs["padding_side"] = "left" + tokenizer.save_pretrained(finetuning_args.export_dir) + except: + logger.warning("Cannot save tokenizer, please copy the files manually.") + + +if __name__ == "__main__": + run_exp() diff --git a/LLM-Detector-V4-11w/src/llmtuner/train/utils.py b/LLM-Detector-V4-11w/src/llmtuner/train/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6b40f33be8d47725bd5479e39152d5c19790db9f --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/train/utils.py @@ -0,0 +1,99 @@ +import torch +from typing import TYPE_CHECKING, Literal, Union + +from llmtuner.extras.logging import get_logger +from llmtuner.hparams import ModelArguments, FinetuningArguments +from llmtuner.model import get_modelcard_args, load_model_and_tokenizer, load_valuehead_params + +if TYPE_CHECKING: + from transformers import Seq2SeqTrainingArguments, Trainer + from transformers.modeling_utils import PreTrainedModel + from trl import AutoModelForCausalLMWithValueHead + from llmtuner.hparams import DataArguments + + +logger = get_logger(__name__) + + +def create_modelcard_and_push( + trainer: "Trainer", + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: 
"Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments" +) -> None: + if training_args.do_train: + if training_args.push_to_hub: + trainer.push_to_hub(**get_modelcard_args(model_args, data_args, finetuning_args)) + return + try: + trainer.create_model_card(**get_modelcard_args(model_args, data_args, finetuning_args)) + except Exception as err: + logger.warning("Failed to create model card: {}".format(str(err))) + + +def create_ref_model( + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + stage: Literal["ppo", "dpo"] +) -> Union["PreTrainedModel", "AutoModelForCausalLMWithValueHead"]: + r""" + Creates reference model for PPO/DPO training. Evaluation mode is not supported. + + The valuehead parameter is randomly initialized since it is useless for PPO training. + """ + if finetuning_args.ref_model is not None: + ref_model_args_dict = model_args.to_dict() + ref_model_args_dict.update(dict( + model_name_or_path=finetuning_args.ref_model, + checkpoint_dir=finetuning_args.ref_model_checkpoint, + quantization_bit=finetuning_args.ref_model_quantization_bit + )) + ref_model_args = ModelArguments(**ref_model_args_dict) + ref_finetuning_args = FinetuningArguments(finetuning_type="lora") + ref_model, _ = load_model_and_tokenizer(ref_model_args, ref_finetuning_args, is_trainable=False, stage=stage) + logger.info("Created reference model from {}".format(finetuning_args.ref_model)) + else: + if finetuning_args.finetuning_type == "lora": + ref_model = None + else: + ref_model, _ = load_model_and_tokenizer(model_args, finetuning_args, is_trainable=False, stage=stage) + logger.info("Created reference model from the model itself.") + + return ref_model + + +def create_reward_model( + model: "AutoModelForCausalLMWithValueHead", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments" +) -> "AutoModelForCausalLMWithValueHead": + r""" + Creates reward model for PPO training. + """ + if finetuning_args.reward_model_type == "lora": + model.pretrained_model.load_adapter(finetuning_args.reward_model, "reward") + for name, param in model.named_parameters(): # https://github.com/huggingface/peft/issues/1090 + if "default" in name: + param.data = param.data.to(torch.float32) # trainable params should in fp32 + vhead_params = load_valuehead_params(finetuning_args.reward_model, model_args) + assert vhead_params is not None, "Reward model is not correctly loaded." 
+ model.register_buffer("reward_head_weight", vhead_params["v_head.summary.weight"], persistent=False) + model.register_buffer("reward_head_bias", vhead_params["v_head.summary.bias"], persistent=False) + model.register_buffer("default_head_weight", torch.zeros_like(vhead_params["v_head.summary.weight"]), persistent=False) + model.register_buffer("default_head_bias", torch.zeros_like(vhead_params["v_head.summary.bias"]), persistent=False) + logger.info("Loaded adapter weights of reward model from {}".format(finetuning_args.reward_model)) + return None + else: + reward_model_args_dict = model_args.to_dict() + reward_model_args_dict.update(dict( + model_name_or_path=finetuning_args.reward_model, + checkpoint_dir=finetuning_args.reward_model_checkpoint, + quantization_bit=finetuning_args.reward_model_quantization_bit + )) + reward_model_args = ModelArguments(**reward_model_args_dict) + reward_finetuning_args = FinetuningArguments(finetuning_type="lora") + reward_model, _ = load_model_and_tokenizer(reward_model_args, reward_finetuning_args, is_trainable=False, stage="ppo") + logger.info("Load full weights of reward model from {}".format(finetuning_args.reward_model)) + logger.warning("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.") + return reward_model diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/webui/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a27c7f6ea0d98ad23b3f5b239d678748f0d38f76 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/__init__.py @@ -0,0 +1 @@ +from llmtuner.webui.interface import create_ui, create_web_demo diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/chatter.py b/LLM-Detector-V4-11w/src/llmtuner/webui/chatter.py new file mode 100644 index 0000000000000000000000000000000000000000..ddf80b278299e39793d5659ea1deef02d644f834 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/chatter.py @@ -0,0 +1,128 @@ +import gradio as gr +from gradio.components import Component # cannot use TYPE_CHECKING here +from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple + +from llmtuner.chat import ChatModel +from llmtuner.extras.misc import torch_gc +from llmtuner.hparams import GeneratingArguments +from llmtuner.webui.common import get_save_dir +from llmtuner.webui.locales import ALERTS + +if TYPE_CHECKING: + from llmtuner.webui.manager import Manager + + +class WebChatModel(ChatModel): + + def __init__( + self, + manager: "Manager", + demo_mode: Optional[bool] = False, + lazy_init: Optional[bool] = True + ) -> None: + self.manager = manager + self.demo_mode = demo_mode + self.model = None + self.tokenizer = None + self.generating_args = GeneratingArguments() + + if not lazy_init: # read arguments from command line + super().__init__() + + if demo_mode: # load demo_config.json if exists + import json + try: + with open("demo_config.json", "r", encoding="utf-8") as f: + args = json.load(f) + assert args.get("model_name_or_path", None) and args.get("template", None) + super().__init__(args) + except AssertionError: + print("Please provided model name and template in `demo_config.json`.") + except: + print("Cannot find `demo_config.json` at current directory.") + + @property + def loaded(self) -> bool: + return self.model is not None + + def load_model(self, data: Dict[Component, Any]) -> Generator[str, None, None]: + get = lambda name: data[self.manager.get_elem_by_name(name)] + lang = get("top.lang") + error = "" + if 
self.loaded: + error = ALERTS["err_exists"][lang] + elif not get("top.model_name"): + error = ALERTS["err_no_model"][lang] + elif not get("top.model_path"): + error = ALERTS["err_no_path"][lang] + elif self.demo_mode: + error = ALERTS["err_demo"][lang] + + if error: + gr.Warning(error) + yield error + return + + if get("top.checkpoints"): + checkpoint_dir = ",".join([ + get_save_dir(get("top.model_name"), get("top.finetuning_type"), ckpt) for ckpt in get("top.checkpoints") + ]) + else: + checkpoint_dir = None + + yield ALERTS["info_loading"][lang] + args = dict( + model_name_or_path=get("top.model_path"), + checkpoint_dir=checkpoint_dir, + finetuning_type=get("top.finetuning_type"), + quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + template=get("top.template"), + system_prompt=get("top.system_prompt"), + flash_attn=get("top.flash_attn"), + shift_attn=get("top.shift_attn"), + rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None + ) + super().__init__(args) + + yield ALERTS["info_loaded"][lang] + + def unload_model(self, data: Dict[Component, Any]) -> Generator[str, None, None]: + lang = data[self.manager.get_elem_by_name("top.lang")] + + if self.demo_mode: + gr.Warning(ALERTS["err_demo"][lang]) + yield ALERTS["err_demo"][lang] + return + + yield ALERTS["info_unloading"][lang] + self.model = None + self.tokenizer = None + torch_gc() + yield ALERTS["info_unloaded"][lang] + + def predict( + self, + chatbot: List[Tuple[str, str]], + query: str, + history: List[Tuple[str, str]], + system: str, + max_new_tokens: int, + top_p: float, + temperature: float + ) -> Generator[Tuple[List[Tuple[str, str]], List[Tuple[str, str]]], None, None]: + chatbot.append([query, ""]) + response = "" + for new_text in self.stream_chat( + query, history, system, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature + ): + response += new_text + new_history = history + [(query, response)] + chatbot[-1] = [query, self.postprocess(response)] + yield chatbot, new_history + + def postprocess(self, response: str) -> str: + blocks = response.split("```") + for i, block in enumerate(blocks): + if i % 2 == 0: + blocks[i] = block.replace("<", "&lt;").replace(">", "&gt;") # escape HTML special characters outside of code blocks + return "```".join(blocks) diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/common.py b/LLM-Detector-V4-11w/src/llmtuner/webui/common.py new file mode 100644 index 0000000000000000000000000000000000000000..ab2502e1aaf122a853e0ea7abfa492fa16d4c1e5 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/common.py @@ -0,0 +1,122 @@ +import os +import json +import gradio as gr +from typing import Any, Dict, Optional +from transformers.utils import ( + WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + ADAPTER_WEIGHTS_NAME, + ADAPTER_SAFE_WEIGHTS_NAME +) + +from llmtuner.extras.constants import ( + DEFAULT_MODULE, + DEFAULT_TEMPLATE, + SUPPORTED_MODELS, + TRAINING_STAGES, + DownloadSource +) + +from llmtuner.extras.misc import use_modelscope +from llmtuner.hparams.data_args import DATA_CONFIG + + +DEFAULT_CACHE_DIR = "cache" +DEFAULT_DATA_DIR = "data" +DEFAULT_SAVE_DIR = "saves" +USER_CONFIG = "user.config" +CKPT_NAMES = [ + WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + ADAPTER_WEIGHTS_NAME, + ADAPTER_SAFE_WEIGHTS_NAME +] + + +def get_save_dir(*args) -> os.PathLike: + return os.path.join(DEFAULT_SAVE_DIR, *args) + + +def get_config_path() -> os.PathLike: + return
os.path.join(DEFAULT_CACHE_DIR, USER_CONFIG) + + +def load_config() -> Dict[str, Any]: + try: + with open(get_config_path(), "r", encoding="utf-8") as f: + return json.load(f) + except: + return {"lang": None, "last_model": None, "path_dict": {}, "cache_dir": None} + + +def save_config(lang: str, model_name: Optional[str] = None, model_path: Optional[str] = None) -> None: + os.makedirs(DEFAULT_CACHE_DIR, exist_ok=True) + user_config = load_config() + user_config["lang"] = lang or user_config["lang"] + if model_name: + user_config["last_model"] = model_name + user_config["path_dict"][model_name] = model_path + with open(get_config_path(), "w", encoding="utf-8") as f: + json.dump(user_config, f, indent=2, ensure_ascii=False) + + +def get_model_path(model_name: str) -> str: + user_config = load_config() + path_dict: Dict[DownloadSource, str] = SUPPORTED_MODELS.get(model_name, []) + model_path = user_config["path_dict"].get(model_name, None) or path_dict.get(DownloadSource.DEFAULT, "") + if ( + use_modelscope() + and path_dict.get(DownloadSource.MODELSCOPE) + and model_path == path_dict.get(DownloadSource.DEFAULT) + ): # replace path + model_path = path_dict.get(DownloadSource.MODELSCOPE) + return model_path + + +def get_prefix(model_name: str) -> str: + return model_name.split("-")[0] + + +def get_module(model_name: str) -> str: + return DEFAULT_MODULE.get(get_prefix(model_name), "q_proj,v_proj") + + +def get_template(model_name: str) -> str: + if model_name and model_name.endswith("Chat") and get_prefix(model_name) in DEFAULT_TEMPLATE: + return DEFAULT_TEMPLATE[get_prefix(model_name)] + return "default" + + +def list_checkpoint(model_name: str, finetuning_type: str) -> Dict[str, Any]: + checkpoints = [] + if model_name: + save_dir = get_save_dir(model_name, finetuning_type) + if save_dir and os.path.isdir(save_dir): + for checkpoint in os.listdir(save_dir): + if ( + os.path.isdir(os.path.join(save_dir, checkpoint)) + and any([os.path.isfile(os.path.join(save_dir, checkpoint, name)) for name in CKPT_NAMES]) + ): + checkpoints.append(checkpoint) + return gr.update(value=[], choices=checkpoints) + + +def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]: + try: + with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f: + return json.load(f) + except Exception as err: + print("Cannot open {} due to {}.".format(os.path.join(dataset_dir, DATA_CONFIG), str(err))) + return {} + + +def list_dataset( + dataset_dir: Optional[str] = None, training_stage: Optional[str] = list(TRAINING_STAGES.keys())[0] +) -> Dict[str, Any]: + dataset_info = load_dataset_info(dataset_dir if dataset_dir is not None else DEFAULT_DATA_DIR) + ranking = TRAINING_STAGES[training_stage] in ["rm", "dpo"] + datasets = [k for k, v in dataset_info.items() if v.get("ranking", False) == ranking] + return gr.update(value=[], choices=datasets) diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/components/__init__.py b/LLM-Detector-V4-11w/src/llmtuner/webui/components/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..32228b8e7b00f5bc7cda374f4527a1a868e6b8a2 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/components/__init__.py @@ -0,0 +1,6 @@ +from llmtuner.webui.components.top import create_top +from llmtuner.webui.components.train import create_train_tab +from llmtuner.webui.components.eval import create_eval_tab +from llmtuner.webui.components.infer import create_infer_tab +from llmtuner.webui.components.export import create_export_tab +from 
llmtuner.webui.components.chatbot import create_chat_box diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/components/chatbot.py b/LLM-Detector-V4-11w/src/llmtuner/webui/components/chatbot.py new file mode 100644 index 0000000000000000000000000000000000000000..13e2dd4d7e7b8a98fa7a50a0efa9198caaed75c5 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/components/chatbot.py @@ -0,0 +1,49 @@ +import gradio as gr +from typing import TYPE_CHECKING, Dict, Optional, Tuple + +if TYPE_CHECKING: + from gradio.blocks import Block + from gradio.components import Component + from llmtuner.webui.engine import Engine + + +def create_chat_box( + engine: "Engine", + visible: Optional[bool] = False +) -> Tuple["Block", "Component", "Component", Dict[str, "Component"]]: + with gr.Box(visible=visible) as chat_box: + chatbot = gr.Chatbot() + history = gr.State([]) + with gr.Row(): + with gr.Column(scale=4): + system = gr.Textbox(show_label=False) + query = gr.Textbox(show_label=False, lines=8) + submit_btn = gr.Button(variant="primary") + + with gr.Column(scale=1): + clear_btn = gr.Button() + gen_kwargs = engine.chatter.generating_args + max_new_tokens = gr.Slider(10, 2048, value=gen_kwargs.max_new_tokens, step=1) + top_p = gr.Slider(0.01, 1, value=gen_kwargs.top_p, step=0.01) + temperature = gr.Slider(0.01, 1.5, value=gen_kwargs.temperature, step=0.01) + + submit_btn.click( + engine.chatter.predict, + [chatbot, query, history, system, max_new_tokens, top_p, temperature], + [chatbot, history], + show_progress=True + ).then( + lambda: gr.update(value=""), outputs=[query] + ) + + clear_btn.click(lambda: ([], []), outputs=[chatbot, history], show_progress=True) + + return chat_box, chatbot, history, dict( + system=system, + query=query, + submit_btn=submit_btn, + clear_btn=clear_btn, + max_new_tokens=max_new_tokens, + top_p=top_p, + temperature=temperature + ) diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/components/data.py b/LLM-Detector-V4-11w/src/llmtuner/webui/components/data.py new file mode 100644 index 0000000000000000000000000000000000000000..effa39da1cf1686de1a912ebbaa31ee058247a0a --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/components/data.py @@ -0,0 +1,103 @@ +import os +import json +import gradio as gr +from typing import TYPE_CHECKING, Any, Dict, Tuple + +from llmtuner.webui.common import DATA_CONFIG + +if TYPE_CHECKING: + from gradio.components import Component + + +PAGE_SIZE = 2 + + +def prev_page(page_index: int) -> int: + return page_index - 1 if page_index > 0 else page_index + + +def next_page(page_index: int, total_num: int) -> int: + return page_index + 1 if (page_index + 1) * PAGE_SIZE < total_num else page_index + + +def can_preview(dataset_dir: str, dataset: list) -> Dict[str, Any]: + with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f: + dataset_info = json.load(f) + + if ( + len(dataset) > 0 + and "file_name" in dataset_info[dataset[0]] + and os.path.isfile(os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])) + ): + return gr.update(interactive=True) + else: + return gr.update(interactive=False) + + +def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, Dict[str, Any]]: + with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f: + dataset_info = json.load(f) + + data_file: str = dataset_info[dataset[0]]["file_name"] + with open(os.path.join(dataset_dir, data_file), "r", encoding="utf-8") as f: + if data_file.endswith(".json"): + data = json.load(f) + elif 
data_file.endswith(".jsonl"): + data = [json.loads(line) for line in f] + else: + data = [line for line in f] + return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.update(visible=True) + + +def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dict[str, "Component"]: + data_preview_btn = gr.Button(interactive=False, scale=1) + with gr.Column(visible=False, elem_classes="modal-box") as preview_box: + with gr.Row(): + preview_count = gr.Number(value=0, interactive=False, precision=0) + page_index = gr.Number(value=0, interactive=False, precision=0) + + with gr.Row(): + prev_btn = gr.Button() + next_btn = gr.Button() + close_btn = gr.Button() + + with gr.Row(): + preview_samples = gr.JSON(interactive=False) + + dataset.change( + can_preview, [dataset_dir, dataset], [data_preview_btn], queue=False + ).then( + lambda: 0, outputs=[page_index], queue=False + ) + data_preview_btn.click( + get_preview, + [dataset_dir, dataset, page_index], + [preview_count, preview_samples, preview_box], + queue=False + ) + prev_btn.click( + prev_page, [page_index], [page_index], queue=False + ).then( + get_preview, + [dataset_dir, dataset, page_index], + [preview_count, preview_samples, preview_box], + queue=False + ) + next_btn.click( + next_page, [page_index, preview_count], [page_index], queue=False + ).then( + get_preview, + [dataset_dir, dataset, page_index], + [preview_count, preview_samples, preview_box], + queue=False + ) + close_btn.click(lambda: gr.update(visible=False), outputs=[preview_box], queue=False) + return dict( + data_preview_btn=data_preview_btn, + preview_count=preview_count, + page_index=page_index, + prev_btn=prev_btn, + next_btn=next_btn, + close_btn=close_btn, + preview_samples=preview_samples + ) diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/components/eval.py b/LLM-Detector-V4-11w/src/llmtuner/webui/components/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..0718c63e3b5c00e4b95e2abc403f4c0026d9a89f --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/components/eval.py @@ -0,0 +1,71 @@ +import gradio as gr +from typing import TYPE_CHECKING, Dict + +from llmtuner.webui.common import list_dataset, DEFAULT_DATA_DIR +from llmtuner.webui.components.data import create_preview_box + +if TYPE_CHECKING: + from gradio.components import Component + from llmtuner.webui.engine import Engine + + +def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]: + input_elems = engine.manager.get_base_elems() + elem_dict = dict() + + with gr.Row(): + dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2) + dataset = gr.Dropdown(multiselect=True, scale=4) + preview_elems = create_preview_box(dataset_dir, dataset) + + dataset_dir.change(list_dataset, [dataset_dir], [dataset], queue=False) + + input_elems.update({dataset_dir, dataset}) + elem_dict.update(dict(dataset_dir=dataset_dir, dataset=dataset, **preview_elems)) + + with gr.Row(): + cutoff_len = gr.Slider(value=1024, minimum=4, maximum=8192, step=1) + max_samples = gr.Textbox(value="100000") + batch_size = gr.Slider(value=8, minimum=1, maximum=512, step=1) + predict = gr.Checkbox(value=True) + + input_elems.update({cutoff_len, max_samples, batch_size, predict}) + elem_dict.update(dict( + cutoff_len=cutoff_len, max_samples=max_samples, batch_size=batch_size, predict=predict + )) + + with gr.Row(): + max_new_tokens = gr.Slider(10, 2048, value=128, step=1) + top_p = gr.Slider(0.01, 1, value=0.7, step=0.01) + temperature = gr.Slider(0.01, 1.5, value=0.95, 
step=0.01) + output_dir = gr.Textbox() + + input_elems.update({max_new_tokens, top_p, temperature, output_dir}) + elem_dict.update(dict( + max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature, output_dir=output_dir + )) + + with gr.Row(): + cmd_preview_btn = gr.Button() + start_btn = gr.Button() + stop_btn = gr.Button() + + with gr.Row(): + resume_btn = gr.Checkbox(visible=False, interactive=False, value=False) + process_bar = gr.Slider(visible=False, interactive=False) + + with gr.Box(): + output_box = gr.Markdown() + + output_elems = [output_box, process_bar] + elem_dict.update(dict( + cmd_preview_btn=cmd_preview_btn, start_btn=start_btn, stop_btn=stop_btn, + resume_btn=resume_btn, process_bar=process_bar, output_box=output_box + )) + + cmd_preview_btn.click(engine.runner.preview_eval, input_elems, output_elems) + start_btn.click(engine.runner.run_eval, input_elems, output_elems) + stop_btn.click(engine.runner.set_abort, queue=False) + resume_btn.change(engine.runner.monitor, outputs=output_elems) + + return elem_dict diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/components/export.py b/LLM-Detector-V4-11w/src/llmtuner/webui/components/export.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac6f3e67de9d6e7d5cde6e7a3270a32390f962e --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/components/export.py @@ -0,0 +1,80 @@ +import gradio as gr +from typing import TYPE_CHECKING, Dict, Generator, List + +from llmtuner.train import export_model +from llmtuner.webui.common import get_save_dir +from llmtuner.webui.locales import ALERTS + +if TYPE_CHECKING: + from gradio.components import Component + from llmtuner.webui.engine import Engine + + +def save_model( + lang: str, + model_name: str, + model_path: str, + checkpoints: List[str], + finetuning_type: str, + template: str, + max_shard_size: int, + export_dir: str +) -> Generator[str, None, None]: + error = "" + if not model_name: + error = ALERTS["err_no_model"][lang] + elif not model_path: + error = ALERTS["err_no_path"][lang] + elif not checkpoints: + error = ALERTS["err_no_checkpoint"][lang] + elif not export_dir: + error = ALERTS["err_no_export_dir"][lang] + + if error: + gr.Warning(error) + yield error + return + + args = dict( + model_name_or_path=model_path, + checkpoint_dir=",".join([get_save_dir(model_name, finetuning_type, ckpt) for ckpt in checkpoints]), + finetuning_type=finetuning_type, + template=template, + export_dir=export_dir, + export_size=max_shard_size + ) + + yield ALERTS["info_exporting"][lang] + export_model(args) + yield ALERTS["info_exported"][lang] + + +def create_export_tab(engine: "Engine") -> Dict[str, "Component"]: + with gr.Row(): + export_dir = gr.Textbox() + max_shard_size = gr.Slider(value=1, minimum=1, maximum=100) + + export_btn = gr.Button() + info_box = gr.Textbox(show_label=False, interactive=False) + + export_btn.click( + save_model, + [ + engine.manager.get_elem_by_name("top.lang"), + engine.manager.get_elem_by_name("top.model_name"), + engine.manager.get_elem_by_name("top.model_path"), + engine.manager.get_elem_by_name("top.checkpoints"), + engine.manager.get_elem_by_name("top.finetuning_type"), + engine.manager.get_elem_by_name("top.template"), + max_shard_size, + export_dir + ], + [info_box] + ) + + return dict( + export_dir=export_dir, + max_shard_size=max_shard_size, + export_btn=export_btn, + info_box=info_box + ) diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/components/infer.py b/LLM-Detector-V4-11w/src/llmtuner/webui/components/infer.py 
new file mode 100644 index 0000000000000000000000000000000000000000..d6dd7eed77b94f9b140b218276c15654b39c2f52 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/components/infer.py @@ -0,0 +1,39 @@ +import gradio as gr +from typing import TYPE_CHECKING, Dict + +from llmtuner.webui.components.chatbot import create_chat_box + +if TYPE_CHECKING: + from gradio.components import Component + from llmtuner.webui.engine import Engine + + +def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]: + input_elems = engine.manager.get_base_elems() + elem_dict = dict() + + with gr.Row(): + load_btn = gr.Button() + unload_btn = gr.Button() + + info_box = gr.Textbox(show_label=False, interactive=False) + elem_dict.update(dict(load_btn=load_btn, unload_btn=unload_btn, info_box=info_box)) + + chat_box, chatbot, history, chat_elems = create_chat_box(engine, visible=False) + elem_dict.update(dict(chat_box=chat_box, **chat_elems)) + + load_btn.click( + engine.chatter.load_model, input_elems, [info_box] + ).then( + lambda: gr.update(visible=engine.chatter.loaded), outputs=[chat_box] + ) + + unload_btn.click( + engine.chatter.unload_model, input_elems, [info_box] + ).then( + lambda: ([], []), outputs=[chatbot, history] + ).then( + lambda: gr.update(visible=engine.chatter.loaded), outputs=[chat_box] + ) + + return elem_dict diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/components/top.py b/LLM-Detector-V4-11w/src/llmtuner/webui/components/top.py new file mode 100644 index 0000000000000000000000000000000000000000..0cbd291a98fec8b76bc0c11d7ad4c3f4e1c7fec9 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/components/top.py @@ -0,0 +1,74 @@ +import gradio as gr +from typing import TYPE_CHECKING, Dict + +from llmtuner.data.template import templates +from llmtuner.extras.constants import METHODS, SUPPORTED_MODELS +from llmtuner.webui.common import get_model_path, get_template, list_checkpoint, save_config +from llmtuner.webui.utils import can_quantize + +if TYPE_CHECKING: + from gradio.components import Component + + +def create_top() -> Dict[str, "Component"]: + available_models = list(SUPPORTED_MODELS.keys()) + ["Custom"] + + with gr.Row(): + lang = gr.Dropdown(choices=["en", "zh"], scale=1) + model_name = gr.Dropdown(choices=available_models, scale=3) + model_path = gr.Textbox(scale=3) + + with gr.Row(): + finetuning_type = gr.Dropdown(choices=METHODS, value="lora", scale=1) + checkpoints = gr.Dropdown(multiselect=True, scale=5) + refresh_btn = gr.Button(scale=1) + + with gr.Accordion(label="Advanced config", open=False) as advanced_tab: + with gr.Row(): + quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=1) + template = gr.Dropdown(choices=list(templates.keys()), value="default", scale=1) + system_prompt = gr.Textbox(scale=2) + + with gr.Accordion(label="Model config (LLaMA only)", open=False) as llama_tab: + with gr.Row(): + with gr.Column(): + flash_attn = gr.Checkbox(value=False) + shift_attn = gr.Checkbox(value=False) + rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none") + + model_name.change( + list_checkpoint, [model_name, finetuning_type], [checkpoints], queue=False + ).then( + get_model_path, [model_name], [model_path], queue=False + ).then( + get_template, [model_name], [template], queue=False + ) # do not save config since the below line will save + + model_path.change(save_config, inputs=[lang, model_name, model_path], queue=False) + + finetuning_type.change( + list_checkpoint, [model_name, finetuning_type], 
[checkpoints], queue=False + ).then( + can_quantize, [finetuning_type], [quantization_bit], queue=False + ) + + refresh_btn.click( + list_checkpoint, [model_name, finetuning_type], [checkpoints], queue=False + ) + + return dict( + lang=lang, + model_name=model_name, + model_path=model_path, + finetuning_type=finetuning_type, + checkpoints=checkpoints, + refresh_btn=refresh_btn, + advanced_tab=advanced_tab, + quantization_bit=quantization_bit, + template=template, + system_prompt=system_prompt, + llama_tab=llama_tab, + flash_attn=flash_attn, + shift_attn=shift_attn, + rope_scaling=rope_scaling + ) diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/components/train.py b/LLM-Detector-V4-11w/src/llmtuner/webui/components/train.py new file mode 100644 index 0000000000000000000000000000000000000000..11109c97c42603762db2294679c4269dbf18db5e --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/components/train.py @@ -0,0 +1,154 @@ +import gradio as gr +from typing import TYPE_CHECKING, Dict +from transformers.trainer_utils import SchedulerType + +from llmtuner.extras.constants import TRAINING_STAGES +from llmtuner.webui.common import list_checkpoint, list_dataset, DEFAULT_DATA_DIR +from llmtuner.webui.components.data import create_preview_box +from llmtuner.webui.utils import gen_plot + +if TYPE_CHECKING: + from gradio.components import Component + from llmtuner.webui.engine import Engine + + +def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: + input_elems = engine.manager.get_base_elems() + elem_dict = dict() + + with gr.Row(): + training_stage = gr.Dropdown( + choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=2 + ) + dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2) + dataset = gr.Dropdown(multiselect=True, scale=4) + preview_elems = create_preview_box(dataset_dir, dataset) + + training_stage.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False) + dataset_dir.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False) + + input_elems.update({training_stage, dataset_dir, dataset}) + elem_dict.update(dict( + training_stage=training_stage, dataset_dir=dataset_dir, dataset=dataset, **preview_elems + )) + + with gr.Row(): + cutoff_len = gr.Slider(value=1024, minimum=4, maximum=8192, step=1) + learning_rate = gr.Textbox(value="5e-5") + num_train_epochs = gr.Textbox(value="3.0") + max_samples = gr.Textbox(value="100000") + compute_type = gr.Radio(choices=["fp16", "bf16"], value="fp16") + + input_elems.update({cutoff_len, learning_rate, num_train_epochs, max_samples, compute_type}) + elem_dict.update(dict( + cutoff_len=cutoff_len, learning_rate=learning_rate, num_train_epochs=num_train_epochs, + max_samples=max_samples, compute_type=compute_type + )) + + with gr.Row(): + batch_size = gr.Slider(value=4, minimum=1, maximum=512, step=1) + gradient_accumulation_steps = gr.Slider(value=4, minimum=1, maximum=512, step=1) + lr_scheduler_type = gr.Dropdown( + choices=[scheduler.value for scheduler in SchedulerType], value="cosine" + ) + max_grad_norm = gr.Textbox(value="1.0") + val_size = gr.Slider(value=0, minimum=0, maximum=1, step=0.001) + + input_elems.update({batch_size, gradient_accumulation_steps, lr_scheduler_type, max_grad_norm, val_size}) + elem_dict.update(dict( + batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps, + lr_scheduler_type=lr_scheduler_type, max_grad_norm=max_grad_norm, val_size=val_size + )) + + with gr.Accordion(label="Advanced config", open=False) as 
advanced_tab: + with gr.Row(): + logging_steps = gr.Slider(value=5, minimum=5, maximum=1000, step=5) + save_steps = gr.Slider(value=100, minimum=10, maximum=5000, step=10) + warmup_steps = gr.Slider(value=0, minimum=0, maximum=5000, step=1) + neft_alpha = gr.Slider(value=0, minimum=0, maximum=10, step=0.1) + + with gr.Column(): + train_on_prompt = gr.Checkbox(value=False) + upcast_layernorm = gr.Checkbox(value=False) + + input_elems.update({logging_steps, save_steps, warmup_steps, neft_alpha, train_on_prompt, upcast_layernorm}) + elem_dict.update(dict( + advanced_tab=advanced_tab, logging_steps=logging_steps, save_steps=save_steps, warmup_steps=warmup_steps, + neft_alpha=neft_alpha, train_on_prompt=train_on_prompt, upcast_layernorm=upcast_layernorm + )) + + with gr.Accordion(label="LoRA config", open=False) as lora_tab: + with gr.Row(): + lora_rank = gr.Slider(value=8, minimum=1, maximum=1024, step=1, scale=1) + lora_dropout = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1) + lora_target = gr.Textbox(scale=1) + additional_target = gr.Textbox(scale=1) + resume_lora_training = gr.Checkbox(value=True, scale=1) + + input_elems.update({lora_rank, lora_dropout, lora_target, additional_target, resume_lora_training}) + elem_dict.update(dict( + lora_tab=lora_tab, lora_rank=lora_rank, lora_dropout=lora_dropout, lora_target=lora_target, + additional_target=additional_target, resume_lora_training=resume_lora_training, + )) + + with gr.Accordion(label="RLHF config", open=False) as rlhf_tab: + with gr.Row(): + dpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1) + reward_model = gr.Dropdown(scale=3) + refresh_btn = gr.Button(scale=1) + + refresh_btn.click( + list_checkpoint, + [engine.manager.get_elem_by_name("top.model_name"), engine.manager.get_elem_by_name("top.finetuning_type")], + [reward_model], + queue=False + ) + + input_elems.update({dpo_beta, reward_model}) + elem_dict.update(dict(rlhf_tab=rlhf_tab, dpo_beta=dpo_beta, reward_model=reward_model, refresh_btn=refresh_btn)) + + with gr.Row(): + cmd_preview_btn = gr.Button() + start_btn = gr.Button() + stop_btn = gr.Button() + + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(): + output_dir = gr.Textbox() + + with gr.Row(): + resume_btn = gr.Checkbox(visible=False, interactive=False, value=False) + process_bar = gr.Slider(visible=False, interactive=False) + + with gr.Box(): + output_box = gr.Markdown() + + with gr.Column(scale=1): + loss_viewer = gr.Plot() + + input_elems.add(output_dir) + output_elems = [output_box, process_bar] + + cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems) + start_btn.click(engine.runner.run_train, input_elems, output_elems) + stop_btn.click(engine.runner.set_abort, queue=False) + resume_btn.change(engine.runner.monitor, outputs=output_elems) + + elem_dict.update(dict( + cmd_preview_btn=cmd_preview_btn, start_btn=start_btn, stop_btn=stop_btn, output_dir=output_dir, + resume_btn=resume_btn, process_bar=process_bar, output_box=output_box, loss_viewer=loss_viewer + )) + + output_box.change( + gen_plot, + [ + engine.manager.get_elem_by_name("top.model_name"), + engine.manager.get_elem_by_name("top.finetuning_type"), + output_dir + ], + loss_viewer, + queue=False + ) + + return elem_dict diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/css.py b/LLM-Detector-V4-11w/src/llmtuner/webui/css.py new file mode 100644 index 0000000000000000000000000000000000000000..36e3d4c2867c2791ecf0ce70b57b42b84e532f08 --- /dev/null +++ 
b/LLM-Detector-V4-11w/src/llmtuner/webui/css.py @@ -0,0 +1,27 @@ +CSS = r""" +.duplicate-button { + margin: auto !important; + color: white !important; + background: black !important; + border-radius: 100vh !important; +} + +.modal-box { + position: fixed !important; + top: 50%; + left: 50%; + transform: translate(-50%, -50%); /* center horizontally */ + max-width: 1000px; + max-height: 750px; + overflow-y: auto; + background-color: var(--input-background-fill); + flex-wrap: nowrap !important; + border: 2px solid black !important; + z-index: 1000; + padding: 10px; +} + +.dark .modal-box { + border: 2px solid white !important; +} +""" diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/engine.py b/LLM-Detector-V4-11w/src/llmtuner/webui/engine.py new file mode 100644 index 0000000000000000000000000000000000000000..991b281c8df64c1a5c1a14928dfb433fb2af86ff --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/engine.py @@ -0,0 +1,61 @@ +import gradio as gr +from gradio.components import Component # cannot use TYPE_CHECKING here +from typing import Any, Dict, Generator, Optional + +from llmtuner.webui.chatter import WebChatModel +from llmtuner.webui.common import get_model_path, list_dataset, load_config +from llmtuner.webui.locales import LOCALES +from llmtuner.webui.manager import Manager +from llmtuner.webui.runner import Runner +from llmtuner.webui.utils import get_time + + +class Engine: + + def __init__(self, demo_mode: Optional[bool] = False, pure_chat: Optional[bool] = False) -> None: + self.demo_mode = demo_mode + self.pure_chat = pure_chat + self.manager = Manager() + self.runner = Runner(self.manager, demo_mode=demo_mode) + self.chatter = WebChatModel(manager=self.manager, demo_mode=demo_mode, lazy_init=(not pure_chat)) + + def _form_dict(self, resume_dict: Dict[str, Dict[str, Any]]): + return {self.manager.get_elem_by_name(k): gr.update(**v) for k, v in resume_dict.items()} + + def resume(self) -> Generator[Dict[Component, Dict[str, Any]], None, None]: + user_config = load_config() if not self.demo_mode else {} + lang = user_config.get("lang", None) or "en" + + init_dict = { + "top.lang": {"value": lang}, + "infer.chat_box": {"visible": self.chatter.loaded} + } + + if not self.pure_chat: + init_dict["train.dataset"] = {"choices": list_dataset()["choices"]} + init_dict["eval.dataset"] = {"choices": list_dataset()["choices"]} + + if user_config.get("last_model", None): + init_dict["top.model_name"] = {"value": user_config["last_model"]} + init_dict["top.model_path"] = {"value": get_model_path(user_config["last_model"])} + + yield self._form_dict(init_dict) + + if not self.pure_chat: + if self.runner.alive: + yield {elem: gr.update(value=value) for elem, value in self.runner.running_data.items()} + if self.runner.do_train: + yield self._form_dict({"train.resume_btn": {"value": True}}) + else: + yield self._form_dict({"eval.resume_btn": {"value": True}}) + else: + yield self._form_dict({ + "train.output_dir": {"value": "train_" + get_time()}, + "eval.output_dir": {"value": "eval_" + get_time()}, + }) + + def change_lang(self, lang: str) -> Dict[Component, Dict[str, Any]]: + return { + component: gr.update(**LOCALES[name][lang]) + for elems in self.manager.all_elems.values() for name, component in elems.items() if name in LOCALES + } diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/interface.py b/LLM-Detector-V4-11w/src/llmtuner/webui/interface.py new file mode 100644 index 0000000000000000000000000000000000000000..74ac59a01ef0d7118a14139d059b77e079141ba0 --- /dev/null +++ 
b/LLM-Detector-V4-11w/src/llmtuner/webui/interface.py @@ -0,0 +1,78 @@ +import gradio as gr +from typing import Optional +from transformers.utils.versions import require_version + +from llmtuner.webui.components import ( + create_top, + create_train_tab, + create_eval_tab, + create_infer_tab, + create_export_tab, + create_chat_box +) +from llmtuner.webui.common import save_config +from llmtuner.webui.css import CSS +from llmtuner.webui.engine import Engine + + +require_version("gradio>=3.38.0,<4.0.0", "To fix: pip install \"gradio>=3.38.0,<4.0.0\"") + + +def create_ui(demo_mode: Optional[bool] = False) -> gr.Blocks: + engine = Engine(demo_mode=demo_mode, pure_chat=False) + + with gr.Blocks(title="LLaMA Board", css=CSS) as demo: + if demo_mode: + gr.HTML( + "
<h1><center>LLaMA Board: A One-stop Web UI for Getting Started with LLaMA Factory</center></h1>" + ) + gr.HTML( + "<h3><center>Visit <a href=\"https://github.com/hiyouga/LLaMA-Factory\" target=\"_blank\">" + "LLaMA Factory</a> for details.</center></h3>
" + ) + gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button") + + engine.manager.all_elems["top"] = create_top() + lang: "gr.Dropdown" = engine.manager.get_elem_by_name("top.lang") + + with gr.Tab("Train"): + engine.manager.all_elems["train"] = create_train_tab(engine) + + with gr.Tab("Evaluate & Predict"): + engine.manager.all_elems["eval"] = create_eval_tab(engine) + + with gr.Tab("Chat"): + engine.manager.all_elems["infer"] = create_infer_tab(engine) + + if not demo_mode: + with gr.Tab("Export"): + engine.manager.all_elems["export"] = create_export_tab(engine) + + demo.load(engine.resume, outputs=engine.manager.list_elems()) + lang.change(engine.change_lang, [lang], engine.manager.list_elems(), queue=False) + lang.input(save_config, inputs=[lang], queue=False) + + return demo + + +def create_web_demo() -> gr.Blocks: + engine = Engine(pure_chat=True) + + with gr.Blocks(title="Web Demo", css=CSS) as demo: + lang = gr.Dropdown(choices=["en", "zh"]) + engine.manager.all_elems["top"] = dict(lang=lang) + + chat_box, _, _, chat_elems = create_chat_box(engine, visible=True) + engine.manager.all_elems["infer"] = dict(chat_box=chat_box, **chat_elems) + + demo.load(engine.resume, outputs=engine.manager.list_elems()) + lang.change(engine.change_lang, [lang], engine.manager.list_elems(), queue=False) + lang.input(save_config, inputs=[lang], queue=False) + + return demo + + +if __name__ == "__main__": + demo = create_ui() + demo.queue() + demo.launch(server_name="0.0.0.0", server_port=7860, share=False, inbrowser=True) diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/locales.py b/LLM-Detector-V4-11w/src/llmtuner/webui/locales.py new file mode 100644 index 0000000000000000000000000000000000000000..5f5609d8278832768ca4f3c5ceb8cc7116204228 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/locales.py @@ -0,0 +1,702 @@ +LOCALES = { + "lang": { + "en": { + "label": "Lang" + }, + "zh": { + "label": "语言" + } + }, + "model_name": { + "en": { + "label": "Model name" + }, + "zh": { + "label": "模型名称" + } + }, + "model_path": { + "en": { + "label": "Model path", + "info": "Path to pretrained model or model identifier from Hugging Face." + }, + "zh": { + "label": "模型路径", + "info": "本地模型的文件路径或 Hugging Face 的模型标识符。" + } + }, + "finetuning_type": { + "en": { + "label": "Finetuning method" + }, + "zh": { + "label": "微调方法" + } + }, + "checkpoints": { + "en": { + "label": "Checkpoints" + }, + "zh": { + "label": "模型断点" + } + }, + "refresh_btn": { + "en": { + "value": "Refresh checkpoints" + }, + "zh": { + "value": "刷新断点" + } + }, + "advanced_tab": { + "en": { + "label": "Advanced configurations" + }, + "zh": { + "label": "高级设置" + } + }, + "quantization_bit": { + "en": { + "label": "Quantization bit", + "info": "Enable 4/8-bit model quantization (QLoRA)." + }, + "zh": { + "label": "量化等级", + "info": "启用 4/8 比特模型量化(QLoRA)。" + } + }, + "template": { + "en": { + "label": "Prompt template", + "info": "The template used in constructing prompts." + }, + "zh": { + "label": "提示模板", + "info": "构建提示词时使用的模板" + } + }, + "system_prompt": { + "en": { + "label": "System prompt (optional)", + "info": "A sequence used as the default system prompt." 
+ }, + "zh": { + "label": "系统提示词(非必填)", + "info": "默认使用的系统提示词" + } + }, + "llama_tab": { + "en": { + "label": "Model configurations (LLaMA only)" + }, + "zh": { + "label": "模型设置(仅LLaMA)" + } + }, + "flash_attn": { + "en": { + "label": "Use FlashAttention-2" + }, + "zh": { + "label": "使用 FlashAttention-2" + } + }, + "shift_attn": { + "en": { + "label": "Use shift short attention (S^2-Attn)" + }, + "zh": { + "label": "使用 shift short attention (S^2-Attn)" + } + }, + "rope_scaling": { + "en": { + "label": "RoPE scaling" + }, + "zh": { + "label": "RoPE 插值方法" + } + }, + "training_stage": { + "en": { + "label": "Stage", + "info": "The stage to perform in training." + }, + "zh": { + "label": "训练阶段", + "info": "目前采用的训练方式。" + } + }, + "dataset_dir": { + "en": { + "label": "Data dir", + "info": "Path to the data directory." + }, + "zh": { + "label": "数据路径", + "info": "数据文件夹的路径。" + } + }, + "dataset": { + "en": { + "label": "Dataset" + }, + "zh": { + "label": "数据集" + } + }, + "data_preview_btn": { + "en": { + "value": "Preview dataset" + }, + "zh": { + "value": "预览数据集" + } + }, + "preview_count": { + "en": { + "label": "Count" + }, + "zh": { + "label": "数量" + } + }, + "page_index": { + "en": { + "label": "Page" + }, + "zh": { + "label": "页数" + } + }, + "prev_btn": { + "en": { + "value": "Prev" + }, + "zh": { + "value": "上一页" + } + }, + "next_btn": { + "en": { + "value": "Next" + }, + "zh": { + "value": "下一页" + } + }, + "close_btn": { + "en": { + "value": "Close" + }, + "zh": { + "value": "关闭" + } + }, + "preview_samples": { + "en": { + "label": "Samples" + }, + "zh": { + "label": "样例" + } + }, + "cutoff_len": { + "en": { + "label": "Cutoff length", + "info": "Max tokens in input sequence." + }, + "zh": { + "label": "截断长度", + "info": "输入序列分词后的最大长度。" + } + }, + "learning_rate": { + "en": { + "label": "Learning rate", + "info": "Initial learning rate for AdamW." + }, + "zh": { + "label": "学习率", + "info": "AdamW 优化器的初始学习率。" + } + }, + "num_train_epochs": { + "en": { + "label": "Epochs", + "info": "Total number of training epochs to perform." + }, + "zh": { + "label": "训练轮数", + "info": "需要执行的训练总轮数。" + } + }, + "max_samples": { + "en": { + "label": "Max samples", + "info": "Maximum samples per dataset." + }, + "zh": { + "label": "最大样本数", + "info": "每个数据集最多使用的样本数。" + } + }, + "compute_type": { + "en": { + "label": "Compute type", + "info": "Whether to use fp16 or bf16 mixed precision training." + }, + "zh": { + "label": "计算类型", + "info": "是否启用 FP16 或 BF16 混合精度训练。" + } + }, + "batch_size": { + "en": { + "label": "Batch size", + "info": "Number of samples to process per GPU." + }, + "zh":{ + "label": "批处理大小", + "info": "每块 GPU 上处理的样本数量。" + } + }, + "gradient_accumulation_steps": { + "en": { + "label": "Gradient accumulation", + "info": "Number of gradient accumulation steps." + }, + "zh": { + "label": "梯度累积", + "info": "梯度累积的步数。" + } + }, + "lr_scheduler_type": { + "en": { + "label": "LR Scheduler", + "info": "Name of learning rate scheduler.", + }, + "zh": { + "label": "学习率调节器", + "info": "采用的学习率调节器名称。" + } + }, + "max_grad_norm": { + "en": { + "label": "Maximum gradient norm", + "info": "Norm for gradient clipping.." + }, + "zh": { + "label": "最大梯度范数", + "info": "用于梯度裁剪的范数。" + } + }, + "val_size": { + "en": { + "label": "Val size", + "info": "Proportion of data in the dev set." + }, + "zh": { + "label": "验证集比例", + "info": "验证集占全部样本的百分比。" + } + }, + "logging_steps": { + "en": { + "label": "Logging steps", + "info": "Number of steps between two logs." 
+ }, + "zh": { + "label": "日志间隔", + "info": "每两次日志输出间的更新步数。" + } + }, + "save_steps": { + "en": { + "label": "Save steps", + "info": "Number of steps between two checkpoints." + }, + "zh": { + "label": "保存间隔", + "info": "每两次断点保存间的更新步数。" + } + }, + "warmup_steps": { + "en": { + "label": "Warmup steps", + "info": "Number of steps used for warmup." + }, + "zh": { + "label": "预热步数", + "info": "学习率预热采用的步数。" + } + }, + "neft_alpha": { + "en": { + "label": "NEFTune Alpha", + "info": "Magnitude of noise adding to embedding vectors." + }, + "zh": { + "label": "NEFTune 噪声参数", + "info": "嵌入向量所添加的噪声大小。" + } + }, + "train_on_prompt": { + "en": { + "label": "Train on prompt", + "info": "Compute loss on the prompt tokens in supervised fine-tuning." + }, + "zh": { + "label": "计算输入损失", + "info": "在监督微调时候计算输入序列的损失。" + } + }, + "upcast_layernorm": { + "en": { + "label": "Upcast LayerNorm", + "info": "Upcast weights of layernorm in float32." + }, + "zh": { + "label": "缩放归一化层", + "info": "将归一化层权重缩放至 32 位浮点数。" + } + }, + "lora_tab": { + "en": { + "label": "LoRA configurations" + }, + "zh": { + "label": "LoRA 参数设置" + } + }, + "lora_rank": { + "en": { + "label": "LoRA rank", + "info": "The rank of LoRA matrices." + }, + "zh": { + "label": "LoRA 秩", + "info": "LoRA 矩阵的秩。" + } + }, + "lora_dropout": { + "en": { + "label": "LoRA Dropout", + "info": "Dropout ratio of LoRA weights." + }, + "zh": { + "label": "LoRA 随机丢弃", + "info": "LoRA 权重随机丢弃的概率。" + } + }, + "lora_target": { + "en": { + "label": "LoRA modules (optional)", + "info": "Name(s) of target modules to apply LoRA. Use commas to separate multiple modules." + }, + "zh": { + "label": "LoRA 作用模块(非必填)", + "info": "应用 LoRA 的目标模块名称。使用英文逗号分隔多个名称。" + } + }, + "additional_target": { + "en": { + "label": "Additional modules (optional)", + "info": "Name(s) of modules apart from LoRA layers to be set as trainable. Use commas to separate multiple modules." + }, + "zh": { + "label": "附加模块(非必填)", + "info": "除 LoRA 层以外的可训练模块名称。使用英文逗号分隔多个名称。" + } + }, + "resume_lora_training": { + "en": { + "label": "Resume LoRA training", + "info": "Whether to resume training from the last LoRA weights or create new lora weights." + }, + "zh": { + "label": "继续上次的训练", + "info": "接着上次的 LoRA 权重训练或创建一个新的 LoRA 权重。" + } + }, + "rlhf_tab": { + "en": { + "label": "RLHF configurations" + }, + "zh": { + "label": "RLHF 参数设置" + } + }, + "dpo_beta": { + "en": { + "label": "DPO beta", + "info": "Value of the beta parameter in the DPO loss." + }, + "zh": { + "label": "DPO beta 参数", + "info": "DPO 损失函数中 beta 超参数大小。" + } + }, + "reward_model": { + "en": { + "label": "Reward model", + "info": "Checkpoint of the reward model for PPO training. (Needs to refresh checkpoints)" + }, + "zh": { + "label": "奖励模型", + "info": "PPO 训练中奖励模型的断点路径。(需要刷新断点)" + } + }, + "cmd_preview_btn": { + "en": { + "value": "Preview command" + }, + "zh": { + "value": "预览命令" + } + }, + "start_btn": { + "en": { + "value": "Start" + }, + "zh": { + "value": "开始" + } + }, + "stop_btn": { + "en": { + "value": "Abort" + }, + "zh": { + "value": "中断" + } + }, + "output_dir": { + "en": { + "label": "Output dir", + "info": "Directory for saving results." + }, + "zh": { + "label": "输出目录", + "info": "保存结果的路径。" + } + }, + "output_box": { + "en": { + "value": "Ready." 
+ }, + "zh": { + "value": "准备就绪。" + } + }, + "loss_viewer": { + "en": { + "label": "Loss" + }, + "zh": { + "label": "损失" + } + }, + "predict": { + "en": { + "label": "Save predictions" + }, + "zh": { + "label": "保存预测结果" + } + }, + "load_btn": { + "en": { + "value": "Load model" + }, + "zh": { + "value": "加载模型" + } + }, + "unload_btn": { + "en": { + "value": "Unload model" + }, + "zh": { + "value": "卸载模型" + } + }, + "info_box": { + "en": { + "value": "Model unloaded, please load a model first." + }, + "zh": { + "value": "模型未加载,请先加载模型。" + } + }, + "system": { + "en": { + "placeholder": "System prompt (optional)" + }, + "zh": { + "placeholder": "系统提示词(非必填)" + } + }, + "query": { + "en": { + "placeholder": "Input..." + }, + "zh": { + "placeholder": "输入..." + } + }, + "submit_btn": { + "en": { + "value": "Submit" + }, + "zh": { + "value": "提交" + } + }, + "clear_btn": { + "en": { + "value": "Clear history" + }, + "zh": { + "value": "清空历史" + } + }, + "max_length": { + "en": { + "label": "Maximum length" + }, + "zh": { + "label": "最大长度" + } + }, + "max_new_tokens": { + "en": { + "label": "Maximum new tokens" + }, + "zh": { + "label": "最大生成长度" + } + }, + "top_p": { + "en": { + "label": "Top-p" + }, + "zh": { + "label": "Top-p 采样值" + } + }, + "temperature": { + "en": { + "label": "Temperature" + }, + "zh": { + "label": "温度系数" + } + }, + "export_dir": { + "en": { + "label": "Export dir", + "info": "Directory to save exported model." + }, + "zh": { + "label": "导出目录", + "info": "保存导出模型的文件夹路径。" + } + }, + "max_shard_size": { + "en": { + "label": "Max shard size (GB)", + "info": "The maximum size for a model file." + }, + "zh": { + "label": "最大分块大小(GB)", + "info": "模型文件的最大大小。" + } + }, + "export_btn": { + "en": { + "value": "Export" + }, + "zh": { + "value": "开始导出" + } + } +} + + +ALERTS = { + "err_conflict": { + "en": "A process is in running, please abort it firstly.", + "zh": "任务已存在,请先中断训练。" + }, + "err_exists": { + "en": "You have loaded a model, please unload it first.", + "zh": "模型已存在,请先卸载模型。" + }, + "err_no_model": { + "en": "Please select a model.", + "zh": "请选择模型。" + }, + "err_no_path": { + "en": "Model not found.", + "zh": "模型未找到。" + }, + "err_no_dataset": { + "en": "Please choose a dataset.", + "zh": "请选择数据集。" + }, + "err_no_checkpoint": { + "en": "Please select a checkpoint.", + "zh": "请选择断点。" + }, + "err_no_export_dir": { + "en": "Please provide export dir.", + "zh": "请填写导出目录" + }, + "err_failed": { + "en": "Failed.", + "zh": "训练出错。" + }, + "err_demo": { + "en": "Training is unavailable in demo mode, duplicate the space to a private one first.", + "zh": "展示模式不支持训练,请先复制到私人空间。" + }, + "info_aborting": { + "en": "Aborted, wait for terminating...", + "zh": "训练中断,正在等待线程结束……" + }, + "info_aborted": { + "en": "Ready.", + "zh": "准备就绪。" + }, + "info_finished": { + "en": "Finished.", + "zh": "训练完毕。" + }, + "info_loading": { + "en": "Loading model...", + "zh": "加载中……" + }, + "info_unloading": { + "en": "Unloading model...", + "zh": "卸载中……" + }, + "info_loaded": { + "en": "Model loaded, now you can chat with your model!", + "zh": "模型已加载,可以开始聊天了!" 
+ }, + "info_unloaded": { + "en": "Model unloaded.", + "zh": "模型已卸载。" + }, + "info_exporting": { + "en": "Exporting model...", + "zh": "正在导出模型……" + }, + "info_exported": { + "en": "Model exported.", + "zh": "模型导出完成。" + } +} diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/manager.py b/LLM-Detector-V4-11w/src/llmtuner/webui/manager.py new file mode 100644 index 0000000000000000000000000000000000000000..ca067aea26d23220b06bba182a6b71f00f36cbce --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/manager.py @@ -0,0 +1,35 @@ +from typing import TYPE_CHECKING, Dict, List, Set + +if TYPE_CHECKING: + from gradio.components import Component + + +class Manager: + + def __init__(self) -> None: + self.all_elems: Dict[str, Dict[str, "Component"]] = {} + + def get_elem_by_name(self, name: str) -> "Component": + r""" + Example: top.lang, train.dataset + """ + tab_name, elem_name = name.split(".") + return self.all_elems[tab_name][elem_name] + + def get_base_elems(self) -> Set["Component"]: + return { + self.all_elems["top"]["lang"], + self.all_elems["top"]["model_name"], + self.all_elems["top"]["model_path"], + self.all_elems["top"]["checkpoints"], + self.all_elems["top"]["finetuning_type"], + self.all_elems["top"]["quantization_bit"], + self.all_elems["top"]["template"], + self.all_elems["top"]["system_prompt"], + self.all_elems["top"]["flash_attn"], + self.all_elems["top"]["shift_attn"], + self.all_elems["top"]["rope_scaling"] + } + + def list_elems(self) -> List["Component"]: + return [elem for elems in self.all_elems.values() for elem in elems.values()] diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/runner.py b/LLM-Detector-V4-11w/src/llmtuner/webui/runner.py new file mode 100644 index 0000000000000000000000000000000000000000..664f3354ccfacfc3ce993683546ca402aa73b72d --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/runner.py @@ -0,0 +1,260 @@ +import os +import time +import logging +import gradio as gr +from threading import Thread +from gradio.components import Component # cannot use TYPE_CHECKING here +from typing import TYPE_CHECKING, Any, Dict, Generator, Optional, Tuple + +import transformers +from transformers.trainer import TRAINING_ARGS_NAME + +from llmtuner.extras.callbacks import LogCallback +from llmtuner.extras.constants import TRAINING_STAGES +from llmtuner.extras.logging import LoggerHandler +from llmtuner.extras.misc import torch_gc +from llmtuner.train import run_exp +from llmtuner.webui.common import get_module, get_save_dir, load_config +from llmtuner.webui.locales import ALERTS +from llmtuner.webui.utils import gen_cmd, get_eval_results, update_process_bar + +if TYPE_CHECKING: + from llmtuner.webui.manager import Manager + + +class Runner: + + def __init__(self, manager: "Manager", demo_mode: Optional[bool] = False) -> None: + self.manager = manager + self.demo_mode = demo_mode + """ Resume """ + self.thread: "Thread" = None + self.do_train = True + self.running_data: Dict["Component", Any] = None + """ State """ + self.aborted = False + self.running = False + """ Handler """ + self.logger_handler = LoggerHandler() + self.logger_handler.setLevel(logging.INFO) + logging.root.addHandler(self.logger_handler) + transformers.logging.add_handler(self.logger_handler) + + @property + def alive(self) -> bool: + return self.thread is not None + + def set_abort(self) -> None: + self.aborted = True + + def _initialize(self, data: Dict[Component, Any], do_train: bool, from_preview: bool) -> str: + get = lambda name: data[self.manager.get_elem_by_name(name)] + lang, 
model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path") + dataset = get("train.dataset") if do_train else get("eval.dataset") + + if self.running: + return ALERTS["err_conflict"][lang] + + if not model_name: + return ALERTS["err_no_model"][lang] + + if not model_path: + return ALERTS["err_no_path"][lang] + + if len(dataset) == 0: + return ALERTS["err_no_dataset"][lang] + + if self.demo_mode and (not from_preview): + return ALERTS["err_demo"][lang] + + self.aborted = False + self.logger_handler.reset() + self.trainer_callback = LogCallback(self) + return "" + + def _finalize(self, lang: str, finish_info: str) -> str: + self.thread = None + self.running_data = None + self.running = False + torch_gc() + if self.aborted: + return ALERTS["info_aborted"][lang] + else: + return finish_info + + def _parse_train_args(self, data: Dict[Component, Any]) -> Dict[str, Any]: + get = lambda name: data[self.manager.get_elem_by_name(name)] + user_config = load_config() + + if get("top.checkpoints"): + checkpoint_dir = ",".join([ + get_save_dir(get("top.model_name"), get("top.finetuning_type"), ckpt) for ckpt in get("top.checkpoints") + ]) + else: + checkpoint_dir = None + + args = dict( + stage=TRAINING_STAGES[get("train.training_stage")], + model_name_or_path=get("top.model_path"), + do_train=True, + cache_dir=user_config.get("cache_dir", None), + checkpoint_dir=checkpoint_dir, + finetuning_type=get("top.finetuning_type"), + quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + template=get("top.template"), + system_prompt=get("top.system_prompt"), + flash_attn=get("top.flash_attn"), + shift_attn=get("top.shift_attn"), + rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, + dataset_dir=get("train.dataset_dir"), + dataset=",".join(get("train.dataset")), + cutoff_len=get("train.cutoff_len"), + learning_rate=float(get("train.learning_rate")), + num_train_epochs=float(get("train.num_train_epochs")), + max_samples=int(get("train.max_samples")), + per_device_train_batch_size=get("train.batch_size"), + gradient_accumulation_steps=get("train.gradient_accumulation_steps"), + lr_scheduler_type=get("train.lr_scheduler_type"), + max_grad_norm=float(get("train.max_grad_norm")), + logging_steps=get("train.logging_steps"), + save_steps=get("train.save_steps"), + warmup_steps=get("train.warmup_steps"), + neft_alpha=get("train.neft_alpha"), + train_on_prompt=get("train.train_on_prompt"), + upcast_layernorm=get("train.upcast_layernorm"), + lora_rank=get("train.lora_rank"), + lora_dropout=get("train.lora_dropout"), + lora_target=get("train.lora_target") or get_module(get("top.model_name")), + additional_target=get("train.additional_target") if get("train.additional_target") else None, + resume_lora_training=get("train.resume_lora_training"), + output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("train.output_dir")) + ) + args[get("train.compute_type")] = True + args["disable_tqdm"] = True + + if TRAINING_STAGES[get("train.training_stage")] in ["rm", "ppo", "dpo"]: + args["resume_lora_training"] = (args["quantization_bit"] is not None) + + if args["quantization_bit"] is not None: + args["upcast_layernorm"] = True + + if args["stage"] == "ppo": + args["reward_model"] = get_save_dir( + get("top.model_name"), get("top.finetuning_type"), get("train.reward_model") + ) + args["reward_model_type"] = "lora" if get("top.finetuning_type") == "lora" else "full" + + if 
args["stage"] == "dpo": + args["dpo_beta"] = get("train.dpo_beta") + + if get("train.val_size") > 1e-6 and args["stage"] != "ppo": + args["val_size"] = get("train.val_size") + args["evaluation_strategy"] = "steps" + args["eval_steps"] = get("train.save_steps") + args["load_best_model_at_end"] = True + + return args + + def _parse_eval_args(self, data: Dict[Component, Any]) -> Dict[str, Any]: + get = lambda name: data[self.manager.get_elem_by_name(name)] + user_config = load_config() + + if get("top.checkpoints"): + checkpoint_dir = ",".join([ + get_save_dir(get("top.model_name"), get("top.finetuning_type"), ckpt) for ckpt in get("top.checkpoints") + ]) + else: + checkpoint_dir = None + + args = dict( + stage="sft", + model_name_or_path=get("top.model_path"), + do_eval=True, + predict_with_generate=True, + cache_dir=user_config.get("cache_dir", None), + checkpoint_dir=checkpoint_dir, + finetuning_type=get("top.finetuning_type"), + quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + template=get("top.template"), + system_prompt=get("top.system_prompt"), + flash_attn=get("top.flash_attn"), + shift_attn=get("top.shift_attn"), + rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, + dataset_dir=get("eval.dataset_dir"), + dataset=",".join(get("eval.dataset")), + cutoff_len=get("eval.cutoff_len"), + max_samples=int(get("eval.max_samples")), + per_device_eval_batch_size=get("eval.batch_size"), + max_new_tokens=get("eval.max_new_tokens"), + top_p=get("eval.top_p"), + temperature=get("eval.temperature"), + output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("eval.output_dir")) + ) + + if get("eval.predict"): + args.pop("do_eval", None) + args["do_predict"] = True + + return args + + def _preview(self, data: Dict[Component, Any], do_train: bool) -> Generator[Tuple[str, Dict[str, Any]], None, None]: + error = self._initialize(data, do_train, from_preview=True) + if error: + gr.Warning(error) + yield error, gr.update(visible=False) + else: + args = self._parse_train_args(data) if do_train else self._parse_eval_args(data) + yield gen_cmd(args), gr.update(visible=False) + + def _launch(self, data: Dict[Component, Any], do_train: bool) -> Generator[Tuple[str, Dict[str, Any]], None, None]: + error = self._initialize(data, do_train, from_preview=False) + if error: + gr.Warning(error) + yield error, gr.update(visible=False) + else: + args = self._parse_train_args(data) if do_train else self._parse_eval_args(data) + run_kwargs = dict(args=args, callbacks=[self.trainer_callback]) + self.do_train, self.running_data = do_train, data + self.thread = Thread(target=run_exp, kwargs=run_kwargs) + self.thread.start() + yield from self.monitor() + + def preview_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]: + yield from self._preview(data, do_train=True) + + def preview_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]: + yield from self._preview(data, do_train=False) + + def run_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]: + yield from self._launch(data, do_train=True) + + def run_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]: + yield from self._launch(data, do_train=False) + + def monitor(self) -> Generator[Tuple[str, Dict[str, Any]], None, None]: + get = lambda name: 
self.running_data[self.manager.get_elem_by_name(name)] + self.running = True + lang = get("top.lang") + output_dir = get_save_dir(get("top.model_name"), get("top.finetuning_type"), get( + "{}.output_dir".format("train" if self.do_train else "eval") + )) + + while self.thread.is_alive(): + time.sleep(2) + if self.aborted: + yield ALERTS["info_aborting"][lang], gr.update(visible=False) + else: + yield self.logger_handler.log, update_process_bar(self.trainer_callback) + + if self.do_train: + if os.path.exists(os.path.join(output_dir, TRAINING_ARGS_NAME)): + finish_info = ALERTS["info_finished"][lang] + else: + finish_info = ALERTS["err_failed"][lang] + else: + if os.path.exists(os.path.join(output_dir, "all_results.json")): + finish_info = get_eval_results(os.path.join(output_dir, "all_results.json")) + else: + finish_info = ALERTS["err_failed"][lang] + + yield self._finalize(lang, finish_info), gr.update(visible=False) diff --git a/LLM-Detector-V4-11w/src/llmtuner/webui/utils.py b/LLM-Detector-V4-11w/src/llmtuner/webui/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..34dd18044ca957d0a6e0359c9b68d643c7a2f505 --- /dev/null +++ b/LLM-Detector-V4-11w/src/llmtuner/webui/utils.py @@ -0,0 +1,89 @@ +import os +import json +import gradio as gr +from typing import TYPE_CHECKING, Any, Dict +from datetime import datetime + +from llmtuner.extras.packages import is_matplotlib_available +from llmtuner.extras.ploting import smooth +from llmtuner.webui.common import get_save_dir + +if TYPE_CHECKING: + from llmtuner.extras.callbacks import LogCallback + +if is_matplotlib_available(): + import matplotlib.figure + import matplotlib.pyplot as plt + + +def update_process_bar(callback: "LogCallback") -> Dict[str, Any]: + if not callback.max_steps: + return gr.update(visible=False) + + percentage = round(100 * callback.cur_steps / callback.max_steps, 0) if callback.max_steps != 0 else 100.0 + label = "Running {:d}/{:d}: {} < {}".format( + callback.cur_steps, + callback.max_steps, + callback.elapsed_time, + callback.remaining_time + ) + return gr.update(label=label, value=percentage, visible=True) + + +def get_time() -> str: + return datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + + +def can_quantize(finetuning_type: str) -> Dict[str, Any]: + if finetuning_type != "lora": + return gr.update(value="None", interactive=False) + else: + return gr.update(interactive=True) + + +def gen_cmd(args: Dict[str, Any]) -> str: + args.pop("disable_tqdm", None) + args["plot_loss"] = args.get("do_train", None) + current_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0") + cmd_lines = ["CUDA_VISIBLE_DEVICES={} python src/train_bash.py ".format(current_devices)] + for k, v in args.items(): + if v is not None and v != "": + cmd_lines.append(" --{} {} ".format(k, str(v))) + cmd_text = "\\\n".join(cmd_lines) + cmd_text = "```bash\n{}\n```".format(cmd_text) + return cmd_text + + +def get_eval_results(path: os.PathLike) -> str: + with open(path, "r", encoding="utf-8") as f: + result = json.dumps(json.load(f), indent=4) + return "```json\n{}\n```\n".format(result) + + +def gen_plot(base_model: str, finetuning_type: str, output_dir: str) -> "matplotlib.figure.Figure": + if not base_model: + return + log_file = get_save_dir(base_model, finetuning_type, output_dir, "trainer_log.jsonl") + if not os.path.isfile(log_file): + return + + plt.close("all") + fig = plt.figure() + ax = fig.add_subplot(111) + steps, losses = [], [] + with open(log_file, "r", encoding="utf-8") as f: + for line in f: + log_info = 
json.loads(line) + if log_info.get("loss", None): + steps.append(log_info["current_steps"]) + losses.append(log_info["loss"]) + + if len(losses) == 0: + return None + + ax.plot(steps, losses, alpha=0.4, label="original") + ax.plot(steps, smooth(losses), label="smoothed") + ax.legend() + ax.set_xlabel("step") + ax.set_ylabel("loss") + return fig diff --git a/LLM-Detector-V4-11w/src/train_bash.py b/LLM-Detector-V4-11w/src/train_bash.py new file mode 100644 index 0000000000000000000000000000000000000000..9ddd0586dde8e2c84b61d361ac42a44277ee9337 --- /dev/null +++ b/LLM-Detector-V4-11w/src/train_bash.py @@ -0,0 +1,14 @@ +from llmtuner import run_exp + + +def main(): + run_exp() + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/LLM-Detector-V4-11w/src/train_web.py b/LLM-Detector-V4-11w/src/train_web.py new file mode 100644 index 0000000000000000000000000000000000000000..38efd64d65c5d1a89af9a592c59a868a41b8dcff --- /dev/null +++ b/LLM-Detector-V4-11w/src/train_web.py @@ -0,0 +1,11 @@ +from llmtuner import create_ui + + +def main(): + demo = create_ui() + demo.queue() + demo.launch(server_name="0.0.0.0", server_port=7860, share=False, inbrowser=True) + + +if __name__ == "__main__": + main() diff --git a/LLM-Detector-V4-11w/src/web_demo.py b/LLM-Detector-V4-11w/src/web_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..257536abcfd5368c208096fab0c9d83fcb254470 --- /dev/null +++ b/LLM-Detector-V4-11w/src/web_demo.py @@ -0,0 +1,11 @@ +from llmtuner import create_web_demo + + +def main(): + demo = create_web_demo() + demo.queue() + demo.launch(server_name="0.0.0.0", server_port=7860, share=False, inbrowser=True) + + +if __name__ == "__main__": + main()