JIACHEN LIU committed on
Commit
d49d71b
·
unverified ·
2 Parent(s): 08644b5 a679cf2

Merge pull request #1 from ml-energy/jw-add-benchmark

Browse files
.gitignore CHANGED
@@ -2,4 +2,3 @@
2
  .envrc
3
  pyrightconfig.json
4
  .idea
5
- .DS_Store
 
2
  .envrc
3
  pyrightconfig.json
4
  .idea
 
README.md CHANGED
@@ -20,3 +20,10 @@ Currently setup in `ampere02`:
20
  ```bash
21
  export TRANSFORMERS_CACHE=/data/leaderboard/hfcache
22
  ```
 
 
 
 
 
 
 
 
20
  ```bash
21
  export TRANSFORMERS_CACHE=/data/leaderboard/hfcache
22
  ```
23
+
24
+ Run benchmarks like this:
25
+
26
+ ```console
27
+ $ python benchmark.py --model-path /data/leaderboard/weights/lmsys/vicuna-7B --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
28
+ $ python benchmark.py --model-path databricks/dolly-v2-12b --input-file /data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json
29
+ ```
inference.py → benchmark.py RENAMED
@@ -2,11 +2,16 @@
2
 
3
  from __future__ import annotations
4
 
5
- from typing import Literal
 
 
 
 
6
 
7
  import tyro
8
- import rich
9
  import torch
 
 
10
  from fastchat.serve.inference import generate_stream
11
  from fastchat.model.model_adapter import load_model, get_conversation_template
12
  from zeus.monitor import ZeusMonitor
@@ -35,7 +40,7 @@ SYSTEM_PROMPTS = {
35
 
36
  def main(
37
  model_path: str,
38
- input_prompt: str,
39
  device_index: int = 0,
40
  task: Literal[tuple(SYSTEM_PROMPTS)] = "chat", # type: ignore
41
  load_8bit: bool = False,
@@ -50,7 +55,7 @@ def main(
50
 
51
  Args:
52
  model_path: Path to or Huggingface Hub Id of the model.
53
- input_prompt: Input prompt to use for inference.
54
  device_index: Index of the GPU to use for inference.
55
  task: Type of task to perform inference on.
56
  load_8bit: Whether to load the model in 8-bit mode.
@@ -64,6 +69,26 @@ def main(
64
  if "chatglm" in model_path.lower():
65
  raise ValueError("ChatGLM is not supported.")
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  # Set the device.
68
  torch.cuda.set_device(f"cuda:{device_index}")
69
 
@@ -80,51 +105,119 @@ def main(
80
  )
81
 
82
  # Chats are accumulated in a conversation helper object.
83
- conv = get_conversation_template(model_path)
84
 
85
  # Standardize the system prompt for every model.
86
- conv.system = SYSTEM_PROMPTS[task]
87
- conv.messages = []
88
- conv.offset = 0
89
 
90
- # Construct the input prompt.
91
- conv.append_message(conv.roles[0], input_prompt)
92
- conv.append_message(conv.roles[1], "")
93
- prompt = conv.get_prompt()
94
-
95
- # Generate the ouptut from the model.
96
  gen_params = {
97
  "model": model_path,
98
- "prompt": prompt,
99
  "temperature": temperature,
100
  "repitition_penalty": repitition_penalty,
101
  "max_new_tokens": max_new_tokens,
102
- "stop": conv.stop_str,
103
- "stop_token_ids": conv.stop_token_ids,
104
  "echo": False,
105
  }
106
- output_stream = generate_stream(model, tokenizer, gen_params, device="cuda")
107
- output = {}
108
 
109
- # Inference and measurement!
110
  monitor = ZeusMonitor(gpu_indices=[torch.cuda.current_device()])
111
- monitor.begin_window("inference")
112
- for output in output_stream:
113
- pass
114
- measurements = monitor.end_window("inference")
115
-
116
- # Print the input and output.
117
- rich.print(f"\n[u]Prompt[/u]:\n{prompt.strip()}\n")
118
- output_text = output["text"]
119
- rich.print(f"\n[u]Response[/u]:\n{output_text.strip()}\n")
120
-
121
- # Print numbers.
122
- num_tokens = len(tokenizer.encode(output_text))
123
- rich.print(measurements)
124
- rich.print(f"Number of tokens: {num_tokens}")
125
- rich.print(f"Tokens per seconds: {num_tokens / measurements.time:.2f}")
126
- rich.print(f"Joules per token: {measurements.total_energy / num_tokens:.2f}")
127
- rich.print(f"Average power consumption: {measurements.total_energy / measurements.time:.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
 
130
  if __name__ == "__main__":
 
2
 
3
  from __future__ import annotations
4
 
5
+ import os
6
+ import json
7
+ import copy
8
+ import atexit
9
+ from typing import Generator, Literal
10
 
11
  import tyro
 
12
  import torch
13
+ import rich
14
+ from rich.table import Table
15
  from fastchat.serve.inference import generate_stream
16
  from fastchat.model.model_adapter import load_model, get_conversation_template
17
  from zeus.monitor import ZeusMonitor
 
40
 
41
  def main(
42
  model_path: str,
43
+ input_file: str,
44
  device_index: int = 0,
45
  task: Literal[tuple(SYSTEM_PROMPTS)] = "chat", # type: ignore
46
  load_8bit: bool = False,
 
55
 
56
  Args:
57
  model_path: Path to or Huggingface Hub Id of the model.
58
+ input_file: Path to the input JSON file. Assumed to be our cleaned ShareGPT data.
59
  device_index: Index of the GPU to use for inference.
60
  task: Type of task to perform inference on.
61
  load_8bit: Whether to load the model in 8-bit mode.
 
69
  if "chatglm" in model_path.lower():
70
  raise ValueError("ChatGLM is not supported.")
71
 
72
+ # Print out what we're about to do.
73
+ model_name_cleaned = "--".join(model_path.split("/")[-2:])
74
+ output_dir = f"data/{task}/{model_name_cleaned}"
75
+ output_csv_path = f"{output_dir}/benchmark.json"
76
+ config_json_path = f"{output_dir}/config.json"
77
+ table = Table(title="Benchmark")
78
+ table.add_column("Configuration")
79
+ table.add_column("Value")
80
+ table.add_row("Model", f"{model_name_cleaned} (path: {model_path})")
81
+ table.add_row("Input", input_file)
82
+ table.add_row("Device", f"cuda:{device_index}")
83
+ table.add_row("Task", task)
84
+ table.add_row("8-bit", str(load_8bit))
85
+ table.add_row("Temperature", f"{temperature:.2f}")
86
+ table.add_row("Repitition Penalty", f"{repitition_penalty:.2f}")
87
+ table.add_row("Max New Tokens", str(max_new_tokens))
88
+ table.add_row("Output CSV", output_csv_path)
89
+ table.add_row("Config JSON", config_json_path)
90
+ rich.get_console().print(table)
91
+
92
  # Set the device.
93
  torch.cuda.set_device(f"cuda:{device_index}")
94
 
 
105
  )
106
 
107
  # Chats are accumulated in a conversation helper object.
108
+ conv_base = get_conversation_template(model_path)
109
 
110
  # Standardize the system prompt for every model.
111
+ conv_base.system = SYSTEM_PROMPTS[task]
112
+ conv_base.messages = []
113
+ conv_base.offset = 0
114
 
 
 
 
 
 
 
115
  gen_params = {
116
  "model": model_path,
117
+ "prompt": "EMPTY",
118
  "temperature": temperature,
119
  "repitition_penalty": repitition_penalty,
120
  "max_new_tokens": max_new_tokens,
121
+ "stop": conv_base.stop_str,
122
+ "stop_token_ids": conv_base.stop_token_ids,
123
  "echo": False,
124
  }
 
 
125
 
 
126
  monitor = ZeusMonitor(gpu_indices=[torch.cuda.current_device()])
127
+
128
+ # Output files.
129
+ # Leave only the last two path components and replace slashes with double dashes.
130
+ os.makedirs(output_dir, exist_ok=True)
131
+ output_json = open(output_csv_path, "w")
132
+ output_json.write("[\n")
133
+ output_json.flush()
134
+ # Conclude the JSON file format with a closing bracket. `atexit` handlers run on
135
+ # interpreter exit, including Ctrl-C and uncaught errors, but not on SIGKILL or `os._exit()`.
136
+ atexit.register(lambda: output_json.write("\n]\n"))
137
+
138
+ # Dump the configuration to a JSON file.
139
+ with open(config_json_path, "w") as config_json:
140
+ json.dump(
141
+ {
142
+ "model_path": model_path,
143
+ "input_file": input_file,
144
+ "device_index": device_index,
145
+ "task": task,
146
+ "load_8bit": load_8bit,
147
+ "temperature": temperature,
148
+ "repitition_penalty": repitition_penalty,
149
+ "max_new_tokens": max_new_tokens,
150
+ },
151
+ config_json,
152
+ indent=4,
153
+ )
154
+ config_json.write("\n")
155
+
156
+ def dataloader(input_file: str) -> Generator[tuple[bool, str], None, None]:
157
+ """Yields a tuple of whether this is a warmup run and the input prompt."""
158
+ for _ in range(3):
159
+ yield True, "Say something long and random. I don't care about the content."
160
+ for item in json.load(open(input_file, "r")):
161
+ input_prompt = item["conversations"][0]["value"]
162
+ yield False, input_prompt
163
+
164
+ # Warm up the GPU with some random prompts.
165
+ # Forward through all the prompts.
166
+ is_first = True
167
+ for is_warmup, input_prompt in dataloader(input_file):
168
+ # Construct the input prompt.
169
+ conv = copy.deepcopy(conv_base)
170
+ conv.append_message(conv.roles[0], input_prompt)
171
+ conv.append_message(conv.roles[1], "")
172
+ prompt = conv.get_prompt()
173
+ gen_params["prompt"] = prompt
174
+
175
+ # Print input prompt.
176
+ rich.print(f"\n[u]{'Warmup ' if is_warmup else ''}Prompt[/u]:\n{prompt.strip()}\n")
177
+
178
+ # Generate the output from the model.
179
+ output_stream = generate_stream(model, tokenizer, gen_params, device="cuda")
180
+ output = {}
181
+
182
+ #################################################
183
+ # Inference and measurement zone!
184
+ #################################################
185
+ monitor.begin_window("inference")
186
+ for output in output_stream:
187
+ pass
188
+ measurements = monitor.end_window("inference")
189
+ #################################################
190
+
191
+ # Record numbers.
192
+ output_text = output["text"]
193
+ if not is_warmup:
194
+ response_length = len(tokenizer.encode(output_text)) # number of tokens
195
+ latency = measurements.time
196
+ throughput = response_length / latency
197
+ energy = measurements.total_energy
198
+ output = {
199
+ "model": model_name_cleaned,
200
+ "throughput": throughput,
201
+ "response_length": response_length,
202
+ "latency": latency,
203
+ "energy": energy,
204
+ "input": prompt.strip(),
205
+ "output": output_text.strip(),
206
+ }
207
+ output_str = json.dumps(output, indent=4)
208
+ if not is_warmup:
209
+ if not is_first:
210
+ output_json.write(",\n" + output_str)
211
+ else:
212
+ is_first = False
213
+ output_json.write(output_str)
214
+ output_json.flush()
215
+
216
+ # Print the response.
217
+ rich.print(f"\n[u]{'Warmup ' if is_warmup else ''}Response[/u]:\n{output_text.strip()}\n")
218
+
219
+ # Print measurement.
220
+ rich.print(measurements)
221
 
222
 
223
  if __name__ == "__main__":
data/chat/lmsys--vicuna-7B/benchmark.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model": "lmsys--vicuna-7B",
4
+ "throughput": 30.558443157709224,
5
+ "response_length": 331,
6
+ "latency": 10.831703640520573,
7
+ "energy": 2215.2789999842644,
8
+ "input": "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients... ASSISTANT:",
9
+ "output": "Sure, here are the main ideas of Jeff Walker's Product Launch Formula and how they can be applied to a growth marketing agency implementing these strategies and tactics for their clients:\n\n1. Pre-launch phase:\n* Build an audience through content marketing, social media, and other channels.\n* Create a sense of urgency and exclusivity by offering early access or special bonuses to those on a mailing list.\n* Use teasers, sneak peeks, and other forms of anticipation to build excitement.\n2. Launch phase:\n* Use a webinar or live event to launch the product, using stories, testimonials, and social proof to build trust and credibility.\n* Offer a limited-time special offer or bonus to those who purchase during the launch.\n* Continually communicate with the audience during the launch phase, answering questions, providing updates, and building relationships.\n3. Post-launch phase:\n* Follow up with customers to make sure they are satisfied and provide additional support and resources.\n* Use customer testimonials, success stories, and case studies to continue building social proof and credibility.\n* Continually update and improve the product based on customer feedback.\n\nOverall, the Product Launch Formula emphasizes the importance of building an engaged audience, creating urgency and exclusivity, and providing ongoing support and value to customers. By implementing these strategies and tactics, a growth marketing agency can help their clients launch successful products and grow their businesses."
10
+ },
11
+ {
12
+ "model": "lmsys--vicuna-7B",
13
+ "throughput": 31.514563598172494,
14
+ "response_length": 98,
15
+ "latency": 3.109673395752907,
16
+ "energy": 640.4210000038147,
17
+ "input": "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: How to tell if a customer segment is well segmented? In 3 bullet points. ASSISTANT:",
18
+ "output": "1. Homogeneity: The segment should consist of customers who share similar characteristics, behaviors, or needs.\n2. Distinctiveness: The segment should be different from other segments in terms of their characteristics, behaviors, or needs.\n3. Stability: The segment should remain relatively consistent over time and not change drastically. If the segment is constantly changing or becoming more similar to other segments, it may indicate that the segmentation is not effective."
19
+ },
20
+ {
21
+ "model": "lmsys--vicuna-7B",
22
+ "throughput": 30.362787676034074,
23
+ "response_length": 346,
24
+ "latency": 11.39552809484303,
25
+ "energy": 2334.6629999876022,
26
+ "input": "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: In Java, I want to replace string like \"This is a new {object} at {place}\" with a Map, {object: \"student\", \"point 3, 4\"}, and get a result \"This is a new student at point 3, 4\". How can I do? ASSISTANT:",
27
+ "output": "To replace the string with a map, you can use the `String.format()` method and pass in an object that contains the key-value pairs of the map. Here's an example of how you can do this in Java:\n```\nString originalString = \"This is a new {object} at {place}\";\nString formattedString = String.format(\"This is a new %s at %s\",\n new ObjectMapper().writeValueAsString(map),\n new ObjectMapper().writeValueAsString(place));\n```\nIn this example, `originalString` is the original string that you want to replace, `map` is the map that you want to use to replace the {object} and {place} placeholders, and `ObjectMapper` is a class that is used to convert the map to a string. The `writeValueAsString()` method is used to convert the map to a JSON string, which can then be passed to the `String.format()` method to replace the placeholders in the string.\n\nTo get the result \"This is a new student at point 3, 4\", you can use the following code:\n```\nString result = String.format(\"This is a new %s at %s\",\n new ObjectMapper().writeValueAsString(map),\n new ObjectMapper().writeValueAsString(place));\n```\nYou can then pass the `result` string to a `println` statement or use it in any other way that you like.\n\nI hope this helps! Let me know if you have any other questions."
28
+ },
29
+ {
30
+ "model": "lmsys--vicuna-7B",
31
+ "throughput": 31.208981333084907,
32
+ "response_length": 96,
33
+ "latency": 3.076037598773837,
34
+ "energy": 649.0310000181198,
35
+ "input": "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Metaphorical language is also used to describe the various addressing modes of the instructions. Grandiose language to express their excitement and admiration for the functionality of the instructions being described. Now, rewrite this with more perplexity:\n\nJMP ABCD\nMOV AX, [BX+SI]\nMOV AX, [100]\nMOV AX, [BX]\nMOV AX, [BX\\*2+SI]\nMOV AX, BX\nMOV AX, 7 ASSISTANT:",
36
+ "output": "The instructions described seem to possess a remarkable array of addressing modes, each with its own unique set of characteristics. The use of grandiose language to express excitement and admiration for these instructions is certainly understandable, as their functionality seems to transcend the mere execution of basic computations. With so many different addressing modes to choose from, one might be forgiven for feeling a sense of perplexity and wonder at the sheer complexity of these instructions."
37
+ }
38
+ ]
data/chat/lmsys--vicuna-7B/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_path": "/data/leaderboard/weights/lmsys/vicuna-7B",
3
+ "input_file": "/data/leaderboard/sharegpt/sg_90k_part1_html_cleaned_lang_first_sampled.json",
4
+ "device_index": 0,
5
+ "task": "chat",
6
+ "load_8bit": false,
7
+ "temperature": 0.7,
8
+ "repitition_penalty": 1.0,
9
+ "max_new_tokens": 512
10
+ }
sharegpt/README.md CHANGED
@@ -1,5 +1,6 @@
 
1
 
2
- ## Download ShareGPT :
3
  ```
4
  https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
5
 
@@ -8,30 +9,21 @@ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolv
8
 
9
  ## Install Fastchat
10
  ```
11
- pip3 install fastchat
12
  ```
13
 
14
  ## Clean data:
15
  ```
16
- pip3 install polyglot pyicu pycld2
17
- python3 -m fastchat.data.optional_clean --in sg_90k_part1_html_cleaned.json --out sg_90k_part1_html_cleaned_lang.json --keep-lang en
18
  ```
19
 
20
- ## Extract first sentence (optional)
21
  ```
22
  python extract_first.py --in-file sg_90k_part1_html_cleaned_lang.json --out-file sg_90k_part1_html_cleaned_lang_first.json
23
  ```
24
 
25
- ## Sample data (optional)
26
  ```
27
- python3 -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
28
  ```
29
-
30
- ## ShareGPT Feeder Usage
31
-
32
- ```
33
- from sharegpt_feeder import generator
34
- sharegpt_generator = generator()
35
- print(next(sharegpt_generator))
36
- print(next(sharegpt_generator))
37
- ```
 
1
+ # How we used ShareGPT to create our benchmark dataset
2
 
3
+ ## Download ShareGPT
4
  ```
5
  https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
6
 
 
9
 
10
  ## Install Fastchat
11
  ```
12
+ pip install fastchat
13
  ```
14
 
15
  ## Clean data:
16
  ```
17
+ pip install polyglot pyicu pycld2
18
+ python -m fastchat.data.optional_clean --in sg_90k_part1_html_cleaned.json --out sg_90k_part1_html_cleaned_lang.json --keep-lang en
19
  ```
20
 
21
+ ## Extract first prompt
22
  ```
23
  python extract_first.py --in-file sg_90k_part1_html_cleaned_lang.json --out-file sg_90k_part1_html_cleaned_lang_first.json
24
  ```
25
 
26
+ ## Sample data
27
  ```
28
+ python -m fastchat.data.sample --in sg_90k_part1_html_cleaned_lang_first.json --out sg_90k_part1_html_cleaned_lang_first_sampled.json --end 10000 --max-length 10000
29
  ```
 
 
 
 
 
 
 
 
 
sharegpt/sharegpt_feeder.py DELETED
@@ -1,15 +0,0 @@
1
- ''' Usage
2
- sharegpt_generator = sharegpt_generator()
3
- print(next(sharegpt_generator))
4
- print(next(sharegpt_generator))
5
- print(next(sharegpt_generator))
6
- '''
7
- import json
8
-
9
- def sharegpt_generator(file = 'sg_90k_part1_html_cleaned_lang.json'):
10
- content = json.load(open(file, "r"))
11
- for item in content:
12
- yield item['conversations'][0]['value']
13
-
14
-
15
-