File size: 8,380 Bytes
a679cf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
"""Perform inference of one model on one input prompt and measure time and energy."""

from __future__ import annotations

import os
import json
import copy
import atexit
from typing import Generator, Literal

import tyro
import torch
import rich
from rich.table import Table
from fastchat.serve.inference import generate_stream
from fastchat.model.model_adapter import load_model, get_conversation_template
from zeus.monitor import ZeusMonitor

# System prompts keyed by task name. The keys are the accepted values of the
# `task` argument of `main`, and the selected value overrides the conversation
# template's own system prompt so that every model is prompted identically.
#
# Each prompt is built from adjacent string literals, so every fragment except
# the last must end with a space to keep sentences separated.
SYSTEM_PROMPTS = {
    "chat": (
        "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions."
    ),
    "chat-concise": (
        "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions. "
        "The assistant's answers are concise but high-quality."
    ),
    "instruct": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    ),
    "instruct-concise": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request. "
        "The response should be concise but high-quality."
    ),
}


def main(
    model_path: str,
    input_file: str,
    device_index: int = 0,
    task: Literal[tuple(SYSTEM_PROMPTS)] = "chat",  # type: ignore
    load_8bit: bool = False,
    temperature: float = 0.7,
    repitition_penalty: float = 1.0,
    max_new_tokens: int = 512,
) -> None:
    """Run the main routine.

    Code structure is based on
    https://github.com/lm-sys/FastChat/blob/57dea54055/fastchat/serve/inference.py#L249

    Loads the model, runs three warmup prompts followed by the first message
    of every conversation in `input_file`, and measures each generation with
    a Zeus monitoring window. Results are written to
    `data/{task}/{model}/benchmark.json` (a JSON array with one object per
    non-warmup prompt) and the run configuration to
    `data/{task}/{model}/config.json`.

    Args:
        model_path: Path to or Huggingface Hub Id of the model.
        input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
        device_index: Index of the GPU to use for inference.
        task: Type of task to perform inference on.
        load_8bit: Whether to load the model in 8-bit mode.
        temperature: Temperature to use for sampling.
        repitition_penalty: Repetition penalty to use for the model.
        max_new_tokens: Maximum numbers of tokens to generate, ignoring the prompt.

    Raises:
        ValueError: If `model_path` refers to a ChatGLM model.
    """
    # NOTE(JW): ChatGLM is implemented as a special case in FastChat inference.
    # Also, it's primarily a model that's fine-tuned for Chinese, so it doesn't
    # make sense to prompt it in English and talk about its verbosity.
    if "chatglm" in model_path.lower():
        raise ValueError("ChatGLM is not supported.")

    # Leave only the last two path components and replace slashes with double dashes.
    model_name_cleaned = "--".join(model_path.split("/")[-2:])
    output_dir = f"data/{task}/{model_name_cleaned}"
    output_json_path = f"{output_dir}/benchmark.json"
    config_json_path = f"{output_dir}/config.json"

    # Print out what we're about to do.
    table = Table(title="Benchmark")
    table.add_column("Configuration")
    table.add_column("Value")
    table.add_row("Model", f"{model_name_cleaned} (path: {model_path})")
    table.add_row("Input", input_file)
    table.add_row("Device", f"cuda:{device_index}")
    table.add_row("Task", task)
    table.add_row("8-bit", str(load_8bit))
    table.add_row("Temperature", f"{temperature:.2f}")
    table.add_row("Repitition Penalty", f"{repitition_penalty:.2f}")
    table.add_row("Max New Tokens", str(max_new_tokens))
    table.add_row("Output CSV", output_json_path)
    table.add_row("Config JSON", config_json_path)
    rich.get_console().print(table)

    # Set the device.
    torch.cuda.set_device(f"cuda:{device_index}")

    # Load the model (Huggingface PyTorch) and tokenizer (Huggingface).
    model, tokenizer = load_model(
        model_path=model_path,
        device="cuda",
        num_gpus=1,
        max_gpu_memory=None,
        load_8bit=load_8bit,
        cpu_offloading=False,
        gptq_config=None,
        debug=False,
    )

    # Chats are accumulated in a conversation helper object.
    conv_base = get_conversation_template(model_path)

    # Standardize the system prompt for every model.
    conv_base.system = SYSTEM_PROMPTS[task]
    conv_base.messages = []
    conv_base.offset = 0

    gen_params = {
        "model": model_path,
        "prompt": "EMPTY",
        "temperature": temperature,
        # FastChat's `generate_stream` looks this value up under the correctly
        # spelled key; a misspelled key is silently ignored and the default
        # penalty is used instead.
        "repetition_penalty": repitition_penalty,
        "max_new_tokens": max_new_tokens,
        "stop": conv_base.stop_str,
        "stop_token_ids": conv_base.stop_token_ids,
        "echo": False,
    }

    monitor = ZeusMonitor(gpu_indices=[torch.cuda.current_device()])

    # Output files.
    os.makedirs(output_dir, exist_ok=True)
    output_json = open(output_json_path, "w")
    output_json.write("[\n")
    output_json.flush()

    def close_output() -> None:
        """Terminate the JSON array and close the results file."""
        output_json.write("\n]\n")
        output_json.close()

    # Conclude the JSON file format with a closing bracket. Using `atexit` will
    # handle all cases of the program exiting, including Ctrl-C and errors.
    atexit.register(close_output)

    # Dump the configuration to a JSON file. The key spelling is kept as-is so
    # that existing consumers of `config.json` keep working.
    with open(config_json_path, "w") as config_json:
        json.dump(
            {
                "model_path": model_path,
                "input_file": input_file,
                "device_index": device_index,
                "task": task,
                "load_8bit": load_8bit,
                "temperature": temperature,
                "repitition_penalty": repitition_penalty,
                "max_new_tokens": max_new_tokens,
            },
            config_json,
            indent=4,
        )
        config_json.write("\n")

    def dataloader(input_file: str) -> Generator[tuple[bool, str], None, None]:
        """Yields a tuple of whether this is a warmup run and the input prompt."""
        for _ in range(3):
            yield True, "Say something long and random. I don't care about the content."
        # Read everything up front so the file handle is released before inference.
        with open(input_file, "r", encoding="utf-8") as f:
            items = json.load(f)
        for item in items:
            # Only the first message of each conversation is used as the prompt.
            yield False, item["conversations"][0]["value"]

    # Warm up the GPU with some random prompts, then forward through all the prompts.
    is_first = True
    for is_warmup, input_prompt in dataloader(input_file):
        # Construct the input prompt from a fresh copy of the base conversation
        # so that prompts do not accumulate across iterations.
        conv = copy.deepcopy(conv_base)
        conv.append_message(conv.roles[0], input_prompt)
        conv.append_message(conv.roles[1], "")
        prompt = conv.get_prompt()
        gen_params["prompt"] = prompt

        # Print input prompt.
        rich.print(f"\n[u]{'Warmup ' if is_warmup else ''}Prompt[/u]:\n{prompt.strip()}\n")

        # Generate the output from the model.
        output_stream = generate_stream(model, tokenizer, gen_params, device="cuda")
        last_chunk = {}

        #################################################
        # Inference and measurement zone!
        #################################################
        monitor.begin_window("inference")
        # Drain the stream; only the last yielded item is used afterwards.
        for last_chunk in output_stream:
            pass
        measurements = monitor.end_window("inference")
        #################################################

        # Record numbers. Warmup runs are measured and printed but not saved.
        output_text = last_chunk["text"]
        if not is_warmup:
            response_length = len(tokenizer.encode(output_text))  # number of tokens
            latency = measurements.time
            record = {
                "model": model_name_cleaned,
                "throughput": response_length / latency,
                "response_length": response_length,
                "latency": latency,
                "energy": measurements.total_energy,
                "input": prompt.strip(),
                "output": output_text.strip(),
            }
            # Every record but the first is preceded by a comma so that the
            # file is a valid JSON array once the closing bracket is written.
            separator = "" if is_first else ",\n"
            is_first = False
            output_json.write(separator + json.dumps(record, indent=4))
            # Flush per record so partial results survive an abrupt termination.
            output_json.flush()

        # Print the response.
        rich.print(f"\n[u]{'Warmup ' if is_warmup else ''}Response[/u]:\n{output_text.strip()}\n")

        # Print measurement.
        rich.print(measurements)


# Script entry point: expose `main` as a command-line interface through tyro
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    tyro.cli(main)