Jae-Won Chung committed on
Commit
19b22c9
·
0 Parent(s):

Initial commit

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. README.md +13 -0
  3. inference.py +131 -0
  4. pyrightconfig.json +3 -0
  5. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .envrc
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ML.ENERGY Leaderboard
2
+
3
+ ## Devs
4
+
5
+ Currently set up on `ampere02`:
6
+
7
+ 1. Find model weights in `/data/leaderboard/weights/`, e.g., the subdirectories `llama` and `vicuna`.
8
+
9
+ 2. Let's share the Hugging Face Transformers cache:
10
+
11
+ ```bash
12
+ export TRANSFORMERS_CACHE=/data/leaderboard/hfcache
13
+ ```
inference.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Perform inference of one model on one input prompt and measure time and energy."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ import tyro
8
+ import rich
9
+ import torch
10
+ from fastchat.serve.inference import generate_stream
11
+ from fastchat.model.model_adapter import load_model, get_conversation_template
12
+ from zeus.monitor import ZeusMonitor
13
+
14
# System prompts keyed by task name. The keys double as the set of valid
# values for the `task` command line argument, so renaming or reordering them
# changes the CLI. Each value is built from adjacent string literals, so every
# fragment except the last must end with a space to keep sentences separated.
SYSTEM_PROMPTS = {
    "chat": (
        "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions."
    ),
    "chat-concise": (
        "A chat between a human user (prompter) and an artificial intelligence (AI) assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's questions. "
        "The assistant's answers are concise but high-quality."
    ),
    "instruct": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
    ),
    "instruct-concise": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request. "
        "The response should be concise but high-quality."
    ),
}
34
+
35
+
36
def main(
    model_path: str,
    input_prompt: str,
    device_index: int = 0,
    task: Literal[tuple(SYSTEM_PROMPTS)] = "chat",  # type: ignore
    load_8bit: bool = False,
    temperature: float = 0.7,
    repitition_penalty: float = 1.0,
    max_new_tokens: int = 512,
) -> None:
    """Run the main routine.

    Loads one model, runs it on a single prompt, and prints the response
    together with the time and energy measured while generating it.

    Code structure is based on
    https://github.com/lm-sys/FastChat/blob/57dea54055/fastchat/serve/inference.py#L249

    Args:
        model_path: Path to or Hugging Face Hub ID of the model.
        input_prompt: Input prompt to use for inference.
        device_index: Index of the GPU to use for inference.
        task: Type of task to perform inference on. Selects the system prompt
            from `SYSTEM_PROMPTS`.
        load_8bit: Whether to load the model in 8-bit mode.
        temperature: Temperature to use for sampling.
        repitition_penalty: Repetition penalty to use for the model. The
            parameter name keeps its historical spelling so that existing
            command lines keep working.
        max_new_tokens: Maximum number of tokens to generate, ignoring the prompt.

    Raises:
        ValueError: If `model_path` refers to a ChatGLM model.
    """
    # Imported locally: `import rich` alone is not relied upon to expose the
    # `rich.markup` submodule.
    from rich.markup import escape

    # NOTE(JW): ChatGLM is implemented as a special case in FastChat inference.
    # Also, it's primarily a model that's fine-tuned for Chinese, so it doesn't
    # make sense to prompt it in English and talk about its verbosity.
    if "chatglm" in model_path.lower():
        raise ValueError("ChatGLM is not supported.")

    # Set the device.
    torch.cuda.set_device(f"cuda:{device_index}")

    # Load the model (Hugging Face PyTorch) and tokenizer (Hugging Face).
    model, tokenizer = load_model(
        model_path=model_path,
        device="cuda",
        num_gpus=1,
        max_gpu_memory=None,
        load_8bit=load_8bit,
        cpu_offloading=False,
        gptq_config=None,
        debug=False,
    )

    # Chats are accumulated in a conversation helper object.
    conv = get_conversation_template(model_path)

    # Standardize the system prompt for every model.
    conv.system = SYSTEM_PROMPTS[task]
    conv.messages = []
    conv.offset = 0

    # Construct the input prompt.
    conv.append_message(conv.roles[0], input_prompt)
    conv.append_message(conv.roles[1], "")
    prompt = conv.get_prompt()

    # Generate the output from the model.
    gen_params = {
        "model": model_path,
        "prompt": prompt,
        "temperature": temperature,
        # The key must be spelled `repetition_penalty`: that is the name
        # FastChat's `generate_stream` looks up. With a misspelled key the
        # user-supplied value would be silently ignored.
        "repetition_penalty": repitition_penalty,
        "max_new_tokens": max_new_tokens,
        "stop": conv.stop_str,
        "stop_token_ids": conv.stop_token_ids,
        "echo": False,
    }
    output_stream = generate_stream(model, tokenizer, gen_params, device="cuda")
    output = {}

    # Inference and measurement!
    # The stream is drained inside the measurement window; only the last
    # yielded item is kept.
    monitor = ZeusMonitor(gpu_indices=[torch.cuda.current_device()])
    monitor.begin_window("inference")
    for output in output_stream:
        pass
    measurements = monitor.end_window("inference")

    # Print the input and output.
    # Prompt and response are arbitrary text, so escape them: otherwise Rich
    # would interpret bracketed fragments such as `arr[i]` as style markup
    # (dropping them from the display) or fail on an unmatched closing tag.
    rich.print(f"\n[u]Prompt[/u]:\n{escape(prompt.strip())}\n")
    # Fall back to an empty response if the stream yielded nothing.
    output_text = output.get("text", "")
    rich.print(f"\n[u]Response[/u]:\n{escape(output_text.strip())}\n")

    # Print numbers.
    num_tokens = len(tokenizer.encode(output_text))
    elapsed = measurements.time
    energy = measurements.total_energy
    rich.print(measurements)
    rich.print(f"Number of tokens: {num_tokens}")
    rich.print(f"Tokens per seconds: {num_tokens / elapsed:.2f}")
    if num_tokens > 0:
        rich.print(f"Joules per token: {energy / num_tokens:.2f}")
    else:
        # Energy per token is undefined when nothing was generated.
        rich.print("Joules per token: n/a (no tokens were generated)")
    rich.print(f"Average power consumption: {energy / elapsed:.2f}")


# Script entry point: tyro builds the command line interface from the
# signature and docstring of `main`.
if __name__ == "__main__":
    tyro.cli(main)
pyrightconfig.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "extraPaths": ["../zeus", "../fastchat"],
3
+ }
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ zeus-ml==0.4.0
2
+ fastchat==0.2.14
3
+ einops==0.6.1
4
+ tyro==0.5.3
5
+ rwkv==0.7.5