Spaces:
Running
Running
Commit
·
76b423c
1
Parent(s):
57896bb
update
Browse files- app.py +36 -34
- requirements.txt +1 -0
- src/{flashattentionv2.py → attention.py} +78 -65
- src/bettertransformer.py +0 -144
- src/content.py +2 -2
- src/control_panel.py +37 -37
- src/{quantization_kernels.py → kernels.py} +21 -12
- src/leaderboard.py +7 -8
- src/llm_perf.py +84 -98
- src/{latency_score_memory.py → map.py} +9 -6
- src/utils.py +53 -23
app.py
CHANGED
@@ -1,26 +1,25 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
import gradio as gr
|
4 |
|
5 |
-
from src.control_panel import create_control_panel, create_control_callback, create_select_callback
|
6 |
-
from src.latency_score_memory import create_lat_score_mem_plot
|
7 |
-
from src.quantization_kernels import create_quant_plots
|
8 |
-
from src.leaderboard import create_leaderboard_table
|
9 |
-
from src.bettertransformer import create_bt_plots
|
10 |
-
from src.flashattentionv2 import create_fa2_plots
|
11 |
-
from src.llm_perf import get_llm_perf_df
|
12 |
from src.assets import custom_css
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
19 |
)
|
|
|
|
|
|
|
20 |
|
|
|
21 |
|
22 |
-
MACHINE_TO_HARDWARE = {
|
23 |
-
|
|
|
|
|
|
|
24 |
|
25 |
|
26 |
demo = gr.Blocks(css=custom_css)
|
@@ -41,22 +40,27 @@ with demo:
|
|
41 |
datatype_checkboxes,
|
42 |
optimization_checkboxes,
|
43 |
quantization_checkboxes,
|
44 |
-
) = create_control_panel()
|
45 |
####################### HARDWARE SUBTABS #######################
|
46 |
with gr.Tabs(elem_classes="subtabs"):
|
47 |
-
|
48 |
####################### LEADERBOARD TAB #######################
|
49 |
with gr.TabItem("Leaderboard π
", id=0):
|
50 |
-
search_bar, columns_checkboxes, leaderboard_table =
|
|
|
|
|
51 |
with gr.TabItem("Find Your Best Model π§", id=1):
|
52 |
-
lat_score_mem_plot = create_lat_score_mem_plot(
|
53 |
-
|
54 |
-
with gr.TabItem("
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
60 |
|
61 |
####################### CONTROL CALLBACK #######################
|
62 |
create_control_callback(
|
@@ -75,12 +79,10 @@ with demo:
|
|
75 |
# outputs
|
76 |
leaderboard_table,
|
77 |
lat_score_mem_plot,
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
quant_prefill_plot,
|
83 |
-
quant_decode_plot,
|
84 |
)
|
85 |
|
86 |
create_select_callback(
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from src.assets import custom_css
|
4 |
+
|
5 |
+
# from src.attention import create_attn_plots
|
6 |
+
from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
|
7 |
+
from src.control_panel import (
|
8 |
+
create_control_callback,
|
9 |
+
create_control_panel,
|
10 |
+
create_select_callback,
|
11 |
)
|
12 |
+
from src.leaderboard import create_leaderboard_table
|
13 |
+
from src.llm_perf import get_llm_perf_df
|
14 |
+
from src.map import create_lat_score_mem_plot
|
15 |
|
16 |
+
# from custom_kernels import create_quant_krnl_plots
|
17 |
|
18 |
+
MACHINE_TO_HARDWARE = {
|
19 |
+
"1xA10": "A10-24GB-150W π₯οΈ",
|
20 |
+
"1xA100": "A100-80GB-275W π₯οΈ",
|
21 |
+
# "1xH100": "H100-80GB-700W π₯οΈ",
|
22 |
+
}
|
23 |
|
24 |
|
25 |
demo = gr.Blocks(css=custom_css)
|
|
|
40 |
datatype_checkboxes,
|
41 |
optimization_checkboxes,
|
42 |
quantization_checkboxes,
|
43 |
+
) = create_control_panel(machine=machine)
|
44 |
####################### HARDWARE SUBTABS #######################
|
45 |
with gr.Tabs(elem_classes="subtabs"):
|
46 |
+
open_llm_perf_df = get_llm_perf_df(machine=machine)
|
47 |
####################### LEADERBOARD TAB #######################
|
48 |
with gr.TabItem("Leaderboard π
", id=0):
|
49 |
+
search_bar, columns_checkboxes, leaderboard_table = (
|
50 |
+
create_leaderboard_table(open_llm_perf_df)
|
51 |
+
)
|
52 |
with gr.TabItem("Find Your Best Model π§", id=1):
|
53 |
+
lat_score_mem_plot = create_lat_score_mem_plot(open_llm_perf_df)
|
54 |
+
###################### ATTENTIONS SPEEDUP TAB #######################
|
55 |
+
# with gr.TabItem("Attention π", id=2):
|
56 |
+
# attn_prefill_plot, attn_decode_plot = create_attn_plots(
|
57 |
+
# open_llm_perf_df
|
58 |
+
# )
|
59 |
+
# ####################### KERNELS SPEEDUP TAB #######################
|
60 |
+
# with gr.TabItem("Kernels π", id=4):
|
61 |
+
# quant_krnl_prefill_plot, quant_krnl_decode_plot = (
|
62 |
+
# create_quant_krnl_plots(llm_perf_df)
|
63 |
+
# )
|
64 |
|
65 |
####################### CONTROL CALLBACK #######################
|
66 |
create_control_callback(
|
|
|
79 |
# outputs
|
80 |
leaderboard_table,
|
81 |
lat_score_mem_plot,
|
82 |
+
# attn_prefill_plot,
|
83 |
+
# attn_decode_plot,
|
84 |
+
# quant_krnl_prefill_plot,
|
85 |
+
# quant_krnl_decode_plot,
|
|
|
|
|
86 |
)
|
87 |
|
88 |
create_select_callback(
|
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
huggingface_hub
|
|
|
2 |
gradio
|
3 |
plotly
|
4 |
pandas
|
|
|
1 |
huggingface_hub
|
2 |
+
transformers
|
3 |
gradio
|
4 |
plotly
|
5 |
pandas
|
src/{flashattentionv2.py → attention.py}
RENAMED
@@ -2,143 +2,156 @@ import gradio as gr
|
|
2 |
import pandas as pd
|
3 |
import plotly.express as px
|
4 |
|
5 |
-
|
6 |
-
FLASHATTENTIONV2_DATA = [
|
7 |
# open llm
|
8 |
"Model π€",
|
9 |
-
"
|
10 |
-
"Backend π",
|
11 |
"Params (B)",
|
12 |
"Architecture ποΈ",
|
13 |
"Open LLM Score (%)",
|
14 |
# deployment settings
|
15 |
-
"DType π₯",
|
16 |
"Backend π",
|
17 |
-
"Optimization π οΈ",
|
18 |
"Quantization ποΈ",
|
19 |
-
"
|
|
|
|
|
20 |
# primary measurements
|
21 |
"Prefill (s)",
|
22 |
-
"Prefill (s) FlashAttentionV2",
|
23 |
"Decode (tokens/s)",
|
24 |
-
"Decode (tokens/s) FlashAttentionV2",
|
25 |
-
"End-to-End (tokens/s)",
|
26 |
-
"End-to-End (tokens/s) FlashAttentionV2",
|
27 |
# speedups
|
28 |
"Prefill Speedup (%)",
|
29 |
"Decode Speedup (%)",
|
30 |
]
|
31 |
|
32 |
|
33 |
-
def
|
34 |
-
copy_df =
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
fa2_df = pd.merge(
|
40 |
-
|
41 |
fa2_df,
|
42 |
-
on=["Model π€", "Quantization
|
43 |
-
suffixes=["", "
|
44 |
)
|
|
|
|
|
|
|
45 |
# compute speedups
|
46 |
-
|
47 |
-
|
48 |
-
) - 100
|
49 |
-
|
50 |
-
(
|
51 |
).round(2) - 100
|
52 |
-
# filter speedups > 1000%
|
53 |
-
fa2_df = fa2_df[fa2_df["Prefill Speedup (%)"] < 1000]
|
54 |
-
fa2_df = fa2_df[fa2_df["Decode Speedup (%)"] < 1000]
|
55 |
|
56 |
-
return
|
57 |
|
58 |
|
59 |
-
def
|
60 |
-
|
61 |
# plot
|
62 |
-
|
63 |
-
|
64 |
x="Architecture ποΈ",
|
65 |
-
y="
|
66 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
67 |
-
custom_data=
|
68 |
-
color="
|
69 |
points="all",
|
70 |
)
|
71 |
# add hover data
|
72 |
-
|
73 |
hovertemplate="<br>".join(
|
74 |
-
[
|
|
|
|
|
|
|
75 |
)
|
76 |
)
|
77 |
# add layout
|
78 |
-
|
79 |
title={
|
80 |
-
"text": "
|
81 |
-
"y": 0.95,
|
82 |
-
"x": 0.5,
|
83 |
"xanchor": "center",
|
84 |
"yanchor": "top",
|
|
|
|
|
85 |
},
|
|
|
86 |
xaxis_title="LLM Architecture",
|
87 |
-
|
88 |
-
legend_title="Quantization Scheme",
|
89 |
width=1200,
|
90 |
height=600,
|
91 |
)
|
92 |
|
93 |
-
return
|
94 |
|
95 |
|
96 |
-
def
|
97 |
-
|
|
|
98 |
# plot
|
99 |
-
|
100 |
-
|
101 |
x="Architecture ποΈ",
|
102 |
-
y="
|
103 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
104 |
-
custom_data=
|
105 |
-
color="
|
106 |
points="all",
|
107 |
)
|
108 |
# add hover data
|
109 |
-
|
110 |
hovertemplate="<br>".join(
|
111 |
-
[
|
|
|
|
|
|
|
112 |
)
|
113 |
)
|
114 |
# add layout
|
115 |
-
|
116 |
title={
|
117 |
-
"text": "
|
118 |
-
"y": 0.95,
|
119 |
-
"x": 0.5,
|
120 |
"xanchor": "center",
|
121 |
"yanchor": "top",
|
|
|
|
|
122 |
},
|
|
|
123 |
xaxis_title="LLM Architecture",
|
124 |
-
|
125 |
-
legend_title="Quantization Scheme",
|
126 |
width=1200,
|
127 |
height=600,
|
128 |
)
|
129 |
|
130 |
-
return
|
131 |
|
132 |
|
133 |
-
def
|
134 |
# descriptive text
|
135 |
gr.HTML("π Hover over the points π for additional information.", elem_id="text")
|
136 |
# get figures
|
137 |
-
prefill_fig =
|
138 |
-
decode_fig =
|
139 |
|
140 |
# create plots
|
141 |
-
prefill_plot = gr.components.Plot(
|
|
|
|
|
142 |
decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
|
143 |
|
144 |
return prefill_plot, decode_plot
|
|
|
2 |
import pandas as pd
|
3 |
import plotly.express as px
|
4 |
|
5 |
+
ATTN_DATA = [
|
|
|
6 |
# open llm
|
7 |
"Model π€",
|
8 |
+
"Experiment π§ͺ",
|
|
|
9 |
"Params (B)",
|
10 |
"Architecture ποΈ",
|
11 |
"Open LLM Score (%)",
|
12 |
# deployment settings
|
|
|
13 |
"Backend π",
|
|
|
14 |
"Quantization ποΈ",
|
15 |
+
"Precision π₯",
|
16 |
+
"Attention ποΈ",
|
17 |
+
"Kernel βοΈ",
|
18 |
# primary measurements
|
19 |
"Prefill (s)",
|
|
|
20 |
"Decode (tokens/s)",
|
|
|
|
|
|
|
21 |
# speedups
|
22 |
"Prefill Speedup (%)",
|
23 |
"Decode Speedup (%)",
|
24 |
]
|
25 |
|
26 |
|
27 |
+
def get_attn_df(open_llm_perf_df):
|
28 |
+
copy_df = open_llm_perf_df.copy()
|
29 |
+
copy_df["Quantization & Kernel"] = (
|
30 |
+
copy_df["Quantization ποΈ"] + " & " + copy_df["Kernel βοΈ"]
|
31 |
+
)
|
32 |
+
|
33 |
+
eager_df = copy_df[(copy_df["Attention ποΈ"] == "Eager")]
|
34 |
+
sdpa_df = copy_df[(copy_df["Attention ποΈ"] == "SDPA")]
|
35 |
+
fa2_df = copy_df[(copy_df["Attention ποΈ"] == "FAv2")]
|
36 |
+
|
37 |
+
sdpa_df = pd.merge(
|
38 |
+
eager_df,
|
39 |
+
sdpa_df,
|
40 |
+
on=["Model π€", "Quantization & Kernel"],
|
41 |
+
suffixes=["", " other"],
|
42 |
+
)
|
43 |
fa2_df = pd.merge(
|
44 |
+
eager_df,
|
45 |
fa2_df,
|
46 |
+
on=["Model π€", "Quantization & Kernel"],
|
47 |
+
suffixes=["", " other"],
|
48 |
)
|
49 |
+
|
50 |
+
attn_df = pd.concat([sdpa_df, fa2_df])
|
51 |
+
|
52 |
# compute speedups
|
53 |
+
attn_df["Prefill Speedup (%)"] = (
|
54 |
+
(attn_df["Prefill (s)"] / attn_df["Prefill (s) other"]) * 100
|
55 |
+
).round(2) - 100
|
56 |
+
attn_df["Decode Speedup (%)"] = (
|
57 |
+
(attn_df["Decode (tokens/s) other"] / attn_df["Decode (tokens/s)"]) * 100
|
58 |
).round(2) - 100
|
|
|
|
|
|
|
59 |
|
60 |
+
return attn_df
|
61 |
|
62 |
|
63 |
+
def get_attn_prefill_fig(open_llm_perf_df):
|
64 |
+
attn_df = get_attn_df(open_llm_perf_df)
|
65 |
# plot
|
66 |
+
prefill_fig = px.box(
|
67 |
+
attn_df,
|
68 |
x="Architecture ποΈ",
|
69 |
+
y="Prefill Speedup (%)",
|
70 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
71 |
+
custom_data=ATTN_DATA,
|
72 |
+
color="Attention ποΈ other",
|
73 |
points="all",
|
74 |
)
|
75 |
# add hover data
|
76 |
+
prefill_fig.update_traces(
|
77 |
hovertemplate="<br>".join(
|
78 |
+
[
|
79 |
+
f"<b>{column}:</b> %{{customdata[{i}]}}"
|
80 |
+
for i, column in enumerate(ATTN_DATA)
|
81 |
+
]
|
82 |
)
|
83 |
)
|
84 |
# add layout
|
85 |
+
prefill_fig.update_layout(
|
86 |
title={
|
87 |
+
"text": "Prefill Speedup per Architecture, Compared To Eager Attention",
|
|
|
|
|
88 |
"xanchor": "center",
|
89 |
"yanchor": "top",
|
90 |
+
"y": 0.95,
|
91 |
+
"x": 0.5,
|
92 |
},
|
93 |
+
yaxis_title="Prefill Speedup (%)",
|
94 |
xaxis_title="LLM Architecture",
|
95 |
+
legend_title="Attention",
|
|
|
96 |
width=1200,
|
97 |
height=600,
|
98 |
)
|
99 |
|
100 |
+
return prefill_fig
|
101 |
|
102 |
|
103 |
+
def get_attn_decode_fig(open_llm_perf_df):
|
104 |
+
attn_df = get_attn_df(open_llm_perf_df)
|
105 |
+
print(len(attn_df))
|
106 |
# plot
|
107 |
+
decode_fig = px.box(
|
108 |
+
attn_df,
|
109 |
x="Architecture ποΈ",
|
110 |
+
y="Decode Speedup (%)",
|
111 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
112 |
+
custom_data=ATTN_DATA,
|
113 |
+
color="Attention ποΈ other",
|
114 |
points="all",
|
115 |
)
|
116 |
# add hover data
|
117 |
+
decode_fig.update_traces(
|
118 |
hovertemplate="<br>".join(
|
119 |
+
[
|
120 |
+
f"<b>{column}:</b> %{{customdata[{i}]}}"
|
121 |
+
for i, column in enumerate(ATTN_DATA)
|
122 |
+
]
|
123 |
)
|
124 |
)
|
125 |
# add layout
|
126 |
+
decode_fig.update_layout(
|
127 |
title={
|
128 |
+
"text": "Decode Speedup per Architecture, Compared To Eager Attention",
|
|
|
|
|
129 |
"xanchor": "center",
|
130 |
"yanchor": "top",
|
131 |
+
"y": 0.95,
|
132 |
+
"x": 0.5,
|
133 |
},
|
134 |
+
yaxis_title="Decode Speedup (%)",
|
135 |
xaxis_title="LLM Architecture",
|
136 |
+
legend_title="Attention",
|
|
|
137 |
width=1200,
|
138 |
height=600,
|
139 |
)
|
140 |
|
141 |
+
return decode_fig
|
142 |
|
143 |
|
144 |
+
def create_attn_plots(open_llm_perf_df):
|
145 |
# descriptive text
|
146 |
gr.HTML("π Hover over the points π for additional information.", elem_id="text")
|
147 |
# get figures
|
148 |
+
prefill_fig = get_attn_prefill_fig(open_llm_perf_df)
|
149 |
+
decode_fig = get_attn_decode_fig(open_llm_perf_df)
|
150 |
|
151 |
# create plots
|
152 |
+
prefill_plot = gr.components.Plot(
|
153 |
+
value=prefill_fig, elem_id="plot", show_label=False
|
154 |
+
)
|
155 |
decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
|
156 |
|
157 |
return prefill_plot, decode_plot
|
src/bettertransformer.py
DELETED
@@ -1,144 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import pandas as pd
|
3 |
-
import plotly.express as px
|
4 |
-
|
5 |
-
|
6 |
-
BETTERTRANSFORMER_DATA = [
|
7 |
-
# open llm
|
8 |
-
"Model π€",
|
9 |
-
"DType π₯",
|
10 |
-
"Backend π",
|
11 |
-
"Params (B)",
|
12 |
-
"Architecture ποΈ",
|
13 |
-
"Open LLM Score (%)",
|
14 |
-
# deployment settings
|
15 |
-
"DType π₯",
|
16 |
-
"Backend π",
|
17 |
-
"Optimization π οΈ",
|
18 |
-
"Quantization ποΈ",
|
19 |
-
"Optimization π οΈ BetterTransformer",
|
20 |
-
# primary measurements
|
21 |
-
"Prefill (s)",
|
22 |
-
"Prefill (s) BetterTransformer",
|
23 |
-
"Decode (tokens/s)",
|
24 |
-
"Decode (tokens/s) BetterTransformer",
|
25 |
-
"End-to-End (tokens/s)",
|
26 |
-
"End-to-End (tokens/s) BetterTransformer",
|
27 |
-
# speedups
|
28 |
-
"Prefill Speedup (%)",
|
29 |
-
"Decode Speedup (%)",
|
30 |
-
]
|
31 |
-
|
32 |
-
|
33 |
-
def get_bt_df(llm_perf_df):
|
34 |
-
copy_df = llm_perf_df.copy()
|
35 |
-
# seperate original model experiments from BetterTransformer experiments
|
36 |
-
original_df = copy_df[(copy_df["Optimization π οΈ"] == "None") & (copy_df["DType π₯"] == "float16")]
|
37 |
-
bt_df = copy_df[(copy_df["Optimization π οΈ"] == "BetterTransformer") & (copy_df["DType π₯"] == "float16")]
|
38 |
-
# merge the two dataframes
|
39 |
-
bt_df = pd.merge(
|
40 |
-
original_df,
|
41 |
-
bt_df,
|
42 |
-
on=["Model π€", "Quantization ποΈ"],
|
43 |
-
suffixes=["", " BetterTransformer"],
|
44 |
-
)
|
45 |
-
# compute speedups
|
46 |
-
bt_df["Prefill Speedup (%)"] = (
|
47 |
-
(bt_df["Prefill (s)"] / bt_df["Prefill (s) BetterTransformer"]) * 100
|
48 |
-
).round(2) - 100
|
49 |
-
bt_df["Decode Speedup (%)"] = (
|
50 |
-
(bt_df["Decode (tokens/s) BetterTransformer"] / bt_df["Decode (tokens/s)"]) * 100
|
51 |
-
).round(2) - 100
|
52 |
-
# filter speedups > 1000%
|
53 |
-
bt_df = bt_df[bt_df["Prefill Speedup (%)"] < 1000]
|
54 |
-
bt_df = bt_df[bt_df["Decode Speedup (%)"] < 1000]
|
55 |
-
|
56 |
-
return bt_df
|
57 |
-
|
58 |
-
|
59 |
-
def get_bt_prefill_fig(llm_perf_df):
|
60 |
-
bt_df = get_bt_df(llm_perf_df)
|
61 |
-
# plot
|
62 |
-
prefill_fig = px.box(
|
63 |
-
bt_df,
|
64 |
-
x="Architecture ποΈ",
|
65 |
-
y="Prefill Speedup (%)",
|
66 |
-
color_discrete_sequence=px.colors.qualitative.Light24,
|
67 |
-
custom_data=BETTERTRANSFORMER_DATA,
|
68 |
-
color="Quantization ποΈ",
|
69 |
-
points="all",
|
70 |
-
)
|
71 |
-
# add hover data
|
72 |
-
prefill_fig.update_traces(
|
73 |
-
hovertemplate="<br>".join(
|
74 |
-
[f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
|
75 |
-
)
|
76 |
-
)
|
77 |
-
# add layout
|
78 |
-
prefill_fig.update_layout(
|
79 |
-
title={
|
80 |
-
"text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
|
81 |
-
"y": 0.95,
|
82 |
-
"x": 0.5,
|
83 |
-
"xanchor": "center",
|
84 |
-
"yanchor": "top",
|
85 |
-
},
|
86 |
-
xaxis_title="LLM Architecture",
|
87 |
-
yaxis_title="Prefill Speedup (%)",
|
88 |
-
legend_title="Quantization Scheme",
|
89 |
-
width=1200,
|
90 |
-
height=600,
|
91 |
-
)
|
92 |
-
|
93 |
-
return prefill_fig
|
94 |
-
|
95 |
-
|
96 |
-
def get_bt_decode_fig(llm_perf_df):
|
97 |
-
bt_df = get_bt_df(llm_perf_df)
|
98 |
-
# plot
|
99 |
-
decode_fig = px.box(
|
100 |
-
bt_df,
|
101 |
-
x="Architecture ποΈ",
|
102 |
-
y="Decode Speedup (%)",
|
103 |
-
color_discrete_sequence=px.colors.qualitative.Light24,
|
104 |
-
custom_data=BETTERTRANSFORMER_DATA,
|
105 |
-
color="Quantization ποΈ",
|
106 |
-
points="all",
|
107 |
-
)
|
108 |
-
# add hover data
|
109 |
-
decode_fig.update_traces(
|
110 |
-
hovertemplate="<br>".join(
|
111 |
-
[f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
|
112 |
-
)
|
113 |
-
)
|
114 |
-
# add layout
|
115 |
-
decode_fig.update_layout(
|
116 |
-
title={
|
117 |
-
"text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
|
118 |
-
"y": 0.95,
|
119 |
-
"x": 0.5,
|
120 |
-
"xanchor": "center",
|
121 |
-
"yanchor": "top",
|
122 |
-
},
|
123 |
-
xaxis_title="LLM Architecture",
|
124 |
-
yaxis_title="Decode Speedup (%)",
|
125 |
-
legend_title="Quantization Scheme",
|
126 |
-
width=1200,
|
127 |
-
height=600,
|
128 |
-
)
|
129 |
-
|
130 |
-
return decode_fig
|
131 |
-
|
132 |
-
|
133 |
-
def create_bt_plots(llm_perf_df):
|
134 |
-
# descriptive text
|
135 |
-
gr.HTML("π Hover over the points π for additional information.", elem_id="text")
|
136 |
-
# get figures
|
137 |
-
prefill_fig = get_bt_prefill_fig(llm_perf_df)
|
138 |
-
decode_fig = get_bt_decode_fig(llm_perf_df)
|
139 |
-
|
140 |
-
# create plots
|
141 |
-
prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
|
142 |
-
decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
|
143 |
-
|
144 |
-
return prefill_plot, decode_plot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/content.py
CHANGED
@@ -14,7 +14,7 @@ configuration for automated benchmarking:
|
|
14 |
|
15 |
- Model evaluation requests should be made in the
|
16 |
[π€ Open LLM Leaderboard π
](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
|
17 |
-
we scrape the list of
|
18 |
- Hardware/Backend/Optimization configuration requests should be made in the
|
19 |
[π€ LLM-Perf Leaderboard ποΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
|
20 |
[Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
|
@@ -23,7 +23,7 @@ we scrape the list of pretrained base models from there.
|
|
23 |
|
24 |
- To avoid communication-dependent results, only one GPU is used.
|
25 |
- Score is the average evaluation score obtained from the [π€ Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
|
26 |
-
- LLMs are running on a singleton batch with a prompt size of 256 and generating a
|
27 |
- Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.
|
28 |
- We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.
|
29 |
|
|
|
14 |
|
15 |
- Model evaluation requests should be made in the
|
16 |
[π€ Open LLM Leaderboard π
](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
|
17 |
+
we scrape the [list of canonical base models](https://github.com/huggingface/optimum-benchmark/blob/main/llm_perf/utils.py) from there.
|
18 |
- Hardware/Backend/Optimization configuration requests should be made in the
|
19 |
[π€ LLM-Perf Leaderboard ποΈ](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
|
20 |
[Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
|
|
|
23 |
|
24 |
- To avoid communication-dependent results, only one GPU is used.
|
25 |
- Score is the average evaluation score obtained from the [π€ Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
|
26 |
+
- LLMs are running on a singleton batch with a prompt size of 256 and generating 64 tokens for at least 10 iterations and 10 seconds.
|
27 |
- Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.
|
28 |
- We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.
|
29 |
|
src/control_panel.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
from src.llm_perf import get_llm_perf_df
|
4 |
from src.leaderboard import get_leaderboard_df
|
5 |
-
from src.
|
6 |
-
|
7 |
-
from
|
8 |
-
from
|
|
|
9 |
|
10 |
|
11 |
-
def create_control_panel(machine: str
|
12 |
# controls
|
13 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
14 |
with gr.Accordion("Control Panel ποΈ", open=False, elem_id="control-panel"):
|
@@ -29,7 +29,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
29 |
value=80 * 1024,
|
30 |
elem_id="memory-slider",
|
31 |
)
|
32 |
-
with gr.Column(scale=1):
|
33 |
backend_checkboxes = gr.CheckboxGroup(
|
34 |
label="Backends π",
|
35 |
choices=["pytorch"],
|
@@ -40,7 +40,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
40 |
with gr.Row():
|
41 |
with gr.Column(scale=1, variant="panel"):
|
42 |
datatype_checkboxes = gr.CheckboxGroup(
|
43 |
-
label="
|
44 |
choices=["float32", "float16", "bfloat16"],
|
45 |
value=["float32", "float16", "bfloat16"],
|
46 |
info="βοΈ Select the load data types",
|
@@ -48,13 +48,13 @@ def create_control_panel(machine: str = "hf-dgx-01"):
|
|
48 |
)
|
49 |
with gr.Column(scale=1, variant="panel"):
|
50 |
optimization_checkboxes = gr.CheckboxGroup(
|
51 |
-
label="
|
52 |
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
53 |
value=["None", "BetterTransformer", "FlashAttentionV2"],
|
54 |
info="βοΈ Select the optimization",
|
55 |
elem_id="optimization-checkboxes",
|
56 |
)
|
57 |
-
with gr.Column(scale=2):
|
58 |
quantization_checkboxes = gr.CheckboxGroup(
|
59 |
label="Quantizations ποΈ",
|
60 |
choices=[
|
@@ -118,29 +118,29 @@ def filter_fn(
|
|
118 |
# raw_df["Model π€"].str.contains(model, case=False)
|
119 |
raw_df["Backend π"].isin(backends)
|
120 |
& raw_df["DType π₯"].isin(datatypes)
|
121 |
-
& raw_df["
|
122 |
& raw_df["Quantization ποΈ"].isin(quantizations)
|
123 |
& (raw_df["Open LLM Score (%)"] >= score)
|
124 |
& (raw_df["Allocated Memory (MB)"] <= memory)
|
125 |
]
|
126 |
filtered_leaderboard_df = select_fn(machine, columns, search)
|
127 |
filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
|
128 |
-
filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
|
129 |
-
filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
|
130 |
-
filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
|
131 |
-
filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
|
132 |
-
filtered_quant_prefill_fig = get_quant_prefill_fig(filtered_df)
|
133 |
-
filtered_quant_decode_fig = get_quant_decode_fig(filtered_df)
|
134 |
|
135 |
return [
|
136 |
filtered_leaderboard_df,
|
137 |
filtered_lat_score_mem_fig,
|
138 |
-
filtered_bt_prefill_fig,
|
139 |
-
filtered_bt_decode_fig,
|
140 |
-
filtered_fa2_prefill_fig,
|
141 |
-
filtered_fa2_decode_fig,
|
142 |
-
filtered_quant_prefill_fig,
|
143 |
-
filtered_quant_decode_fig,
|
144 |
]
|
145 |
|
146 |
|
@@ -162,12 +162,12 @@ def create_control_callback(
|
|
162 |
# outputs
|
163 |
leaderboard_table,
|
164 |
lat_score_mem_plot,
|
165 |
-
|
166 |
-
|
167 |
-
fa2_prefill_plot,
|
168 |
-
fa2_decode_plot,
|
169 |
-
quant_prefill_plot,
|
170 |
-
quant_decode_plot,
|
171 |
):
|
172 |
filter_button.click(
|
173 |
fn=filter_fn,
|
@@ -188,19 +188,19 @@ def create_control_callback(
|
|
188 |
outputs=[
|
189 |
leaderboard_table,
|
190 |
lat_score_mem_plot,
|
191 |
-
|
192 |
-
|
193 |
-
fa2_prefill_plot,
|
194 |
-
fa2_decode_plot,
|
195 |
-
quant_prefill_plot,
|
196 |
-
quant_decode_plot,
|
197 |
],
|
198 |
)
|
199 |
|
200 |
|
201 |
def select_fn(machine, columns, search):
|
202 |
-
|
203 |
-
selected_leaderboard_df = get_leaderboard_df(
|
204 |
selected_leaderboard_df = selected_leaderboard_df[
|
205 |
selected_leaderboard_df["Model π€"].str.contains(search, case=False)
|
206 |
]
|
|
|
1 |
import gradio as gr
|
2 |
|
|
|
3 |
from src.leaderboard import get_leaderboard_df
|
4 |
+
from src.llm_perf import get_llm_perf_df
|
5 |
+
|
6 |
+
# from attention_implementations import get_attn_decode_fig, get_attn_prefill_fig
|
7 |
+
# from custom_kernels import get_kernel_decode_fig, get_kernel_prefill_fig
|
8 |
+
from src.map import get_lat_score_mem_fig
|
9 |
|
10 |
|
11 |
+
def create_control_panel(machine: str):
|
12 |
# controls
|
13 |
machine_textbox = gr.Textbox(value=machine, visible=False)
|
14 |
with gr.Accordion("Control Panel ποΈ", open=False, elem_id="control-panel"):
|
|
|
29 |
value=80 * 1024,
|
30 |
elem_id="memory-slider",
|
31 |
)
|
32 |
+
with gr.Column(scale=1, variant="panel"):
|
33 |
backend_checkboxes = gr.CheckboxGroup(
|
34 |
label="Backends π",
|
35 |
choices=["pytorch"],
|
|
|
40 |
with gr.Row():
|
41 |
with gr.Column(scale=1, variant="panel"):
|
42 |
datatype_checkboxes = gr.CheckboxGroup(
|
43 |
+
label="DTypes π₯",
|
44 |
choices=["float32", "float16", "bfloat16"],
|
45 |
value=["float32", "float16", "bfloat16"],
|
46 |
info="βοΈ Select the load data types",
|
|
|
48 |
)
|
49 |
with gr.Column(scale=1, variant="panel"):
|
50 |
optimization_checkboxes = gr.CheckboxGroup(
|
51 |
+
label="Attentions ποΈ",
|
52 |
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
53 |
value=["None", "BetterTransformer", "FlashAttentionV2"],
|
54 |
info="βοΈ Select the optimization",
|
55 |
elem_id="optimization-checkboxes",
|
56 |
)
|
57 |
+
with gr.Column(scale=2, variant="panel"):
|
58 |
quantization_checkboxes = gr.CheckboxGroup(
|
59 |
label="Quantizations ποΈ",
|
60 |
choices=[
|
|
|
118 |
# raw_df["Model π€"].str.contains(model, case=False)
|
119 |
raw_df["Backend π"].isin(backends)
|
120 |
& raw_df["DType π₯"].isin(datatypes)
|
121 |
+
& raw_df["Attention ποΈ"].isin(optimizations)
|
122 |
& raw_df["Quantization ποΈ"].isin(quantizations)
|
123 |
& (raw_df["Open LLM Score (%)"] >= score)
|
124 |
& (raw_df["Allocated Memory (MB)"] <= memory)
|
125 |
]
|
126 |
filtered_leaderboard_df = select_fn(machine, columns, search)
|
127 |
filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
|
128 |
+
# filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
|
129 |
+
# filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
|
130 |
+
# filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
|
131 |
+
# filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
|
132 |
+
# filtered_quant_prefill_fig = get_quant_prefill_fig(filtered_df)
|
133 |
+
# filtered_quant_decode_fig = get_quant_decode_fig(filtered_df)
|
134 |
|
135 |
return [
|
136 |
filtered_leaderboard_df,
|
137 |
filtered_lat_score_mem_fig,
|
138 |
+
# filtered_bt_prefill_fig,
|
139 |
+
# filtered_bt_decode_fig,
|
140 |
+
# filtered_fa2_prefill_fig,
|
141 |
+
# filtered_fa2_decode_fig,
|
142 |
+
# filtered_quant_prefill_fig,
|
143 |
+
# filtered_quant_decode_fig,
|
144 |
]
|
145 |
|
146 |
|
|
|
162 |
# outputs
|
163 |
leaderboard_table,
|
164 |
lat_score_mem_plot,
|
165 |
+
# attn_prefill_plot,
|
166 |
+
# attn_decode_plot,
|
167 |
+
# fa2_prefill_plot,
|
168 |
+
# fa2_decode_plot,
|
169 |
+
# quant_prefill_plot,
|
170 |
+
# quant_decode_plot,
|
171 |
):
|
172 |
filter_button.click(
|
173 |
fn=filter_fn,
|
|
|
188 |
outputs=[
|
189 |
leaderboard_table,
|
190 |
lat_score_mem_plot,
|
191 |
+
# attn_prefill_plot,
|
192 |
+
# attn_decode_plot,
|
193 |
+
# fa2_prefill_plot,
|
194 |
+
# fa2_decode_plot,
|
195 |
+
# quant_prefill_plot,
|
196 |
+
# quant_decode_plot,
|
197 |
],
|
198 |
)
|
199 |
|
200 |
|
201 |
def select_fn(machine, columns, search):
|
202 |
+
llm_perf_df = get_llm_perf_df(machine=machine)
|
203 |
+
selected_leaderboard_df = get_leaderboard_df(llm_perf_df)
|
204 |
selected_leaderboard_df = selected_leaderboard_df[
|
205 |
selected_leaderboard_df["Model π€"].str.contains(search, case=False)
|
206 |
]
|
src/{quantization_kernels.py → kernels.py}
RENAMED
@@ -2,7 +2,6 @@ import gradio as gr
|
|
2 |
import pandas as pd
|
3 |
import plotly.express as px
|
4 |
|
5 |
-
|
6 |
QUANT_DATA = [
|
7 |
# open llm
|
8 |
"Model π€",
|
@@ -14,9 +13,7 @@ QUANT_DATA = [
|
|
14 |
# deployment settings
|
15 |
"DType π₯",
|
16 |
"Backend π",
|
17 |
-
"Optimization π οΈ",
|
18 |
"Quantization ποΈ",
|
19 |
-
"Optimization π οΈ Custom Kernel",
|
20 |
"Quantization ποΈ Custom Kernel",
|
21 |
# primary measurements
|
22 |
"Prefill (s)",
|
@@ -34,9 +31,8 @@ def get_quant_df(llm_perf_df):
|
|
34 |
# separate vanilla GPTQ experiments from Custom Kernel experiments
|
35 |
vanilla_df = copy_df[
|
36 |
(copy_df["Backend π"] == "pytorch")
|
37 |
-
& (copy_df["Quantization ποΈ"] == "None")
|
38 |
-
& (copy_df["Optimization π οΈ"] == "None")
|
39 |
& (copy_df["DType π₯"] == "float16")
|
|
|
40 |
]
|
41 |
exllamav1_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1")]
|
42 |
exllamav2_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2")]
|
@@ -70,11 +66,12 @@ def get_quant_df(llm_perf_df):
|
|
70 |
# concat the two dataframes row-wise
|
71 |
quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
|
72 |
# compute speedups
|
73 |
-
quant_df["Prefill Speedup (%)"] = (
|
74 |
-
|
75 |
-
) - 100
|
76 |
quant_df["Decode Speedup (%)"] = (
|
77 |
-
(quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"])
|
|
|
78 |
).round(2) - 100
|
79 |
# filter speedups > 1000%
|
80 |
quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
|
@@ -97,7 +94,12 @@ def get_quant_decode_fig(llm_perf_df):
|
|
97 |
)
|
98 |
# add hover data
|
99 |
decode_fig.update_traces(
|
100 |
-
hovertemplate="<br>".join(
|
|
|
|
|
|
|
|
|
|
|
101 |
)
|
102 |
# add layout
|
103 |
decode_fig.update_layout(
|
@@ -132,7 +134,12 @@ def get_quant_prefill_fig(llm_perf_df):
|
|
132 |
)
|
133 |
# add hover data
|
134 |
prefill_fig.update_traces(
|
135 |
-
hovertemplate="<br>".join(
|
|
|
|
|
|
|
|
|
|
|
136 |
)
|
137 |
# add layout
|
138 |
prefill_fig.update_layout(
|
@@ -161,7 +168,9 @@ def create_quant_plots(llm_perf_df):
|
|
161 |
decode_fig = get_quant_decode_fig(llm_perf_df)
|
162 |
|
163 |
# create plots
|
164 |
-
prefill_plot = gr.components.Plot(
|
|
|
|
|
165 |
decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
|
166 |
|
167 |
return prefill_plot, decode_plot
|
|
|
2 |
import pandas as pd
|
3 |
import plotly.express as px
|
4 |
|
|
|
5 |
QUANT_DATA = [
|
6 |
# open llm
|
7 |
"Model π€",
|
|
|
13 |
# deployment settings
|
14 |
"DType π₯",
|
15 |
"Backend π",
|
|
|
16 |
"Quantization ποΈ",
|
|
|
17 |
"Quantization ποΈ Custom Kernel",
|
18 |
# primary measurements
|
19 |
"Prefill (s)",
|
|
|
31 |
# separate vanilla GPTQ experiments from Custom Kernel experiments
|
32 |
vanilla_df = copy_df[
|
33 |
(copy_df["Backend π"] == "pytorch")
|
|
|
|
|
34 |
& (copy_df["DType π₯"] == "float16")
|
35 |
+
& (copy_df["Quantization ποΈ"] == "None")
|
36 |
]
|
37 |
exllamav1_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV1")]
|
38 |
exllamav2_df = copy_df[(copy_df["Quantization ποΈ"] == "GPTQ.4bit+ExllamaV2")]
|
|
|
66 |
# concat the custom kernel dataframes row-wise
|
67 |
quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
|
68 |
# compute speedups
|
69 |
+
quant_df["Prefill Speedup (%)"] = (
|
70 |
+
(quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
|
71 |
+
).round(2) - 100
|
72 |
quant_df["Decode Speedup (%)"] = (
|
73 |
+
(quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"])
|
74 |
+
* 100
|
75 |
).round(2) - 100
|
76 |
# filter speedups > 1000%
|
77 |
quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
|
|
|
94 |
)
|
95 |
# add hover data
|
96 |
decode_fig.update_traces(
|
97 |
+
hovertemplate="<br>".join(
|
98 |
+
[
|
99 |
+
f"<b>{column}:</b> %{{customdata[{i}]}}"
|
100 |
+
for i, column in enumerate(QUANT_DATA)
|
101 |
+
]
|
102 |
+
)
|
103 |
)
|
104 |
# add layout
|
105 |
decode_fig.update_layout(
|
|
|
134 |
)
|
135 |
# add hover data
|
136 |
prefill_fig.update_traces(
|
137 |
+
hovertemplate="<br>".join(
|
138 |
+
[
|
139 |
+
f"<b>{column}:</b> %{{customdata[{i}]}}"
|
140 |
+
for i, column in enumerate(QUANT_DATA)
|
141 |
+
]
|
142 |
+
)
|
143 |
)
|
144 |
# add layout
|
145 |
prefill_fig.update_layout(
|
|
|
168 |
decode_fig = get_quant_decode_fig(llm_perf_df)
|
169 |
|
170 |
# create plots
|
171 |
+
prefill_plot = gr.components.Plot(
|
172 |
+
value=prefill_fig, elem_id="plot", show_label=False
|
173 |
+
)
|
174 |
decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
|
175 |
|
176 |
return prefill_plot, decode_plot
|
src/leaderboard.py
CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
|
|
2 |
|
3 |
from src.utils import model_hyperlink, process_score
|
4 |
|
5 |
-
|
6 |
LEADERBOARD_COLUMN_TO_DATATYPE = {
|
7 |
# open llm
|
8 |
"Model π€": "markdown",
|
@@ -13,18 +12,18 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
|
|
13 |
"Memory (MB)": "number",
|
14 |
"Energy (tokens/kWh)": "number",
|
15 |
# deployment settings
|
16 |
-
"DType π₯": "str",
|
17 |
"Backend π": "str",
|
18 |
-
"
|
19 |
"Quantization ποΈ": "str",
|
|
|
|
|
20 |
# additional measurements
|
21 |
-
"
|
22 |
-
"
|
23 |
"Open LLM Score (%)": "number",
|
24 |
"End-to-End (s)": "number",
|
25 |
-
"
|
26 |
-
"
|
27 |
-
"Used Memory (MB)": "number",
|
28 |
}
|
29 |
|
30 |
PRIMARY_COLUMNS = [
|
|
|
2 |
|
3 |
from src.utils import model_hyperlink, process_score
|
4 |
|
|
|
5 |
LEADERBOARD_COLUMN_TO_DATATYPE = {
|
6 |
# open llm
|
7 |
"Model π€": "markdown",
|
|
|
12 |
"Memory (MB)": "number",
|
13 |
"Energy (tokens/kWh)": "number",
|
14 |
# deployment settings
|
|
|
15 |
"Backend π": "str",
|
16 |
+
"Precision π₯": "str",
|
17 |
"Quantization ποΈ": "str",
|
18 |
+
"Attention ποΈ": "str",
|
19 |
+
"Kernel βοΈ": "str",
|
20 |
# additional measurements
|
21 |
+
# "Reserved Memory (MB)": "number",
|
22 |
+
# "Used Memory (MB)": "number",
|
23 |
"Open LLM Score (%)": "number",
|
24 |
"End-to-End (s)": "number",
|
25 |
+
"Architecture ποΈ": "str",
|
26 |
+
"Params (B)": "number",
|
|
|
27 |
}
|
28 |
|
29 |
PRIMARY_COLUMNS = [
|
src/llm_perf.py
CHANGED
@@ -1,123 +1,98 @@
|
|
1 |
import os
|
2 |
|
3 |
import pandas as pd
|
4 |
-
from huggingface_hub import hf_hub_download
|
5 |
|
6 |
-
from .utils import
|
7 |
-
|
8 |
-
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
|
9 |
-
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
10 |
|
11 |
COLUMNS_MAPPING = {
|
12 |
-
"
|
13 |
-
"
|
14 |
# primary measurements
|
15 |
-
"
|
16 |
-
"
|
17 |
-
"
|
18 |
-
"
|
|
|
19 |
# deployment settings
|
20 |
-
"backend.name": "Backend π",
|
21 |
-
"backend.torch_dtype": "
|
22 |
-
"optimization": "Optimization π οΈ",
|
23 |
"quantization": "Quantization ποΈ",
|
24 |
-
|
25 |
-
"
|
26 |
-
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"
|
30 |
-
"
|
31 |
-
"generate.max_memory_used(MB)": "Used Memory (MB)",
|
32 |
}
|
33 |
-
SORTING_COLUMNS = [
|
34 |
-
|
35 |
-
"Decode (tokens/s)",
|
36 |
-
"Prefill (s)",
|
37 |
-
]
|
38 |
SORTING_ASCENDING = [False, True, False]
|
39 |
|
40 |
|
41 |
-
def
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
return llm_df
|
54 |
|
|
|
|
|
|
|
|
|
55 |
|
56 |
-
|
57 |
-
|
58 |
-
repo_id=LLM_PERF_DATASET_REPO,
|
59 |
-
filename=f"{machine}/perf-report.csv",
|
60 |
-
local_dir="dataset",
|
61 |
-
repo_type="dataset",
|
62 |
-
token=HF_TOKEN,
|
63 |
)
|
64 |
-
perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")
|
65 |
|
66 |
-
return
|
67 |
|
68 |
|
69 |
-
def
|
70 |
-
# get dataframes
|
71 |
-
llm_df = get_llm_df()
|
72 |
-
perf_df = get_perf_df(machine=machine)
|
73 |
-
llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
|
74 |
# some assertions
|
75 |
-
assert llm_perf_df["
|
76 |
-
assert llm_perf_df["
|
77 |
-
assert llm_perf_df["
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
# fix nan values
|
83 |
-
llm_perf_df.loc[
|
84 |
-
llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
|
85 |
-
"generate.energy_consumption(tokens/kWh)",
|
86 |
-
] = pd.NA
|
87 |
-
|
88 |
-
# add optimization column
|
89 |
-
llm_perf_df["optimization"] = llm_perf_df[["backend.to_bettertransformer", "backend.use_flash_attention_2"]].apply(
|
90 |
-
lambda x: (
|
91 |
-
"BetterTransformer"
|
92 |
-
if x["backend.to_bettertransformer"]
|
93 |
-
else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None")
|
94 |
-
),
|
95 |
-
axis=1,
|
96 |
)
|
97 |
-
|
98 |
-
|
99 |
-
[
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
]
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
)
|
113 |
-
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bnb-4bit", "BnB-4bit"))
|
114 |
-
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bnb-8bit", "BnB-8bit"))
|
115 |
-
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("awq-4bit", "AWQ-4bit"))
|
116 |
-
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("gptq-4bit", "GPTQ-4bit"))
|
117 |
-
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bettertransformer", "SDPA"))
|
118 |
-
llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("flash-attention-v2", "FA-v2"))
|
119 |
-
# add arch
|
120 |
-
llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
|
121 |
# filter columns
|
122 |
llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
|
123 |
# rename columns
|
@@ -130,3 +105,14 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
|
|
130 |
)
|
131 |
|
132 |
return llm_perf_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
|
3 |
import pandas as pd
|
|
|
4 |
|
5 |
+
from .utils import process_kernels, process_quantizations
|
|
|
|
|
|
|
6 |
|
7 |
COLUMNS_MAPPING = {
|
8 |
+
"config.name": "Experiment π§ͺ",
|
9 |
+
"config.backend.model": "Model π€",
|
10 |
# primary measurements
|
11 |
+
"report.prefill.latency.p50": "Prefill (s)",
|
12 |
+
"report.per_token.latency.p50": "Per Token (s)",
|
13 |
+
"report.decode.throughput.value": "Decode (tokens/s)",
|
14 |
+
"report.decode.efficiency.value": "Energy (tokens/kWh)",
|
15 |
+
"report.decode.memory.max_allocated": "Memory (MB)",
|
16 |
# deployment settings
|
17 |
+
"config.backend.name": "Backend π",
|
18 |
+
"config.backend.torch_dtype": "Precision π₯",
|
|
|
19 |
"quantization": "Quantization ποΈ",
|
20 |
+
"attention": "Attention ποΈ",
|
21 |
+
"kernel": "Kernel βοΈ",
|
22 |
+
# additional information
|
23 |
+
"architecture": "Architecture ποΈ",
|
24 |
+
"prefill+decode": "End-to-End (s)",
|
25 |
+
"Average β¬οΈ": "Open LLM Score (%)",
|
26 |
+
"#Params (B)": "Params (B)",
|
|
|
27 |
}
|
28 |
+
SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
|
29 |
+
SUBSETS = ["unquantized", "awq", "bnb", "gptq"]
|
|
|
|
|
|
|
30 |
SORTING_ASCENDING = [False, True, False]
|
31 |
|
32 |
|
33 |
+
def get_raw_llm_perf_df(machine: str = "1xA10"):
|
34 |
+
dfs = []
|
35 |
+
for subset in SUBSETS:
|
36 |
+
try:
|
37 |
+
dfs.append(
|
38 |
+
pd.read_csv(
|
39 |
+
f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/llm-perf-leaderboard-{subset}-{machine}.csv"
|
40 |
+
)
|
41 |
+
)
|
42 |
+
except Exception:
|
43 |
+
print(f"Subset {subset} for machine {machine} not found")
|
|
|
|
|
44 |
|
45 |
+
llm_perf_df = pd.concat(dfs)
|
46 |
+
open_llm_df = pd.read_csv(
|
47 |
+
"hf://datasets/optimum-benchmark/open-llm-leaderboard/open-llm-leaderboard.csv"
|
48 |
+
)
|
49 |
|
50 |
+
llm_perf_df = pd.merge(
|
51 |
+
open_llm_df, llm_perf_df, left_on="Model", right_on="config.backend.model"
|
|
|
|
|
|
|
|
|
|
|
52 |
)
|
|
|
53 |
|
54 |
+
return llm_perf_df
|
55 |
|
56 |
|
57 |
+
def processed_llm_perf_df(llm_perf_df):
|
|
|
|
|
|
|
|
|
58 |
# some assertions
|
59 |
+
assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
|
60 |
+
assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
|
61 |
+
assert llm_perf_df["config.scenario.generate_kwargs.max_new_tokens"].nunique() == 1
|
62 |
+
assert llm_perf_df["config.scenario.generate_kwargs.min_new_tokens"].nunique() == 1
|
63 |
+
# fix a couple of things
|
64 |
+
llm_perf_df["config.name"] = llm_perf_df["config.name"].str.replace(
|
65 |
+
"flash_attention_2", "fa2"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
)
|
67 |
+
llm_perf_df["prefill+decode"] = (
|
68 |
+
llm_perf_df["report.prefill.latency.p50"]
|
69 |
+
+ (llm_perf_df["report.decode.latency.p50"])
|
70 |
+
)
|
71 |
+
# llm_perf_df["architecture"] = llm_perf_df["config.backend.model"].apply(
|
72 |
+
# process_architectures
|
73 |
+
# )
|
74 |
+
llm_perf_df["architecture"] = llm_perf_df["Architecture"]
|
75 |
+
llm_perf_df["attention"] = (
|
76 |
+
llm_perf_df["config.backend.attn_implementation"]
|
77 |
+
.str.replace("flash_attention_2", "FAv2")
|
78 |
+
.str.replace("eager", "Eager")
|
79 |
+
.str.replace("sdpa", "SDPA")
|
80 |
+
)
|
81 |
+
llm_perf_df["quantization"] = llm_perf_df.apply(process_quantizations, axis=1)
|
82 |
+
llm_perf_df["kernel"] = llm_perf_df.apply(process_kernels, axis=1)
|
83 |
+
# round numerical columns
|
84 |
+
llm_perf_df = llm_perf_df.round(
|
85 |
+
{
|
86 |
+
"report.prefill.latency.p50": 3,
|
87 |
+
"report.decode.latency.p50": 3,
|
88 |
+
"report.decode.throughput.value": 3,
|
89 |
+
"report.decode.efficiency.value": 3,
|
90 |
+
"report.decode.memory.max_allocated": 3,
|
91 |
+
"Average β¬οΈ": 3,
|
92 |
+
"prefill+decode": 3,
|
93 |
+
"#Params (B)": 3,
|
94 |
+
}
|
95 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
# filter columns
|
97 |
llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
|
98 |
# rename columns
|
|
|
105 |
)
|
106 |
|
107 |
return llm_perf_df
|
108 |
+
|
109 |
+
|
110 |
+
def get_llm_perf_df(machine: str = "1xA10"):
|
111 |
+
if os.path.exists(f"llm-perf-leaderboard-{machine}.csv"):
|
112 |
+
llm_perf_df = pd.read_csv(f"llm-perf-leaderboard-{machine}.csv")
|
113 |
+
else:
|
114 |
+
llm_perf_df = get_raw_llm_perf_df(machine)
|
115 |
+
llm_perf_df = processed_llm_perf_df(llm_perf_df)
|
116 |
+
llm_perf_df.to_csv(f"llm-perf-leaderboard-{machine}.csv", index=False)
|
117 |
+
|
118 |
+
return llm_perf_df
|
src/{latency_score_memory.py → map.py}
RENAMED
@@ -1,20 +1,20 @@
|
|
1 |
import gradio as gr
|
2 |
import plotly.express as px
|
3 |
|
4 |
-
|
5 |
SCORE_MEMORY_LATENCY_DATA = [
|
6 |
"Model π€",
|
7 |
-
"DType π₯",
|
8 |
"Backend π",
|
|
|
9 |
"Params (B)",
|
10 |
-
"Architecture ποΈ",
|
11 |
-
"Optimization π οΈ",
|
12 |
"Quantization ποΈ",
|
|
|
|
|
13 |
"Open LLM Score (%)",
|
14 |
"Prefill (s)",
|
15 |
"Decode (tokens/s)",
|
16 |
"Memory (MB)",
|
17 |
"End-to-End (s)",
|
|
|
18 |
]
|
19 |
|
20 |
|
@@ -32,7 +32,10 @@ def get_lat_score_mem_fig(llm_perf_df):
|
|
32 |
)
|
33 |
fig.update_traces(
|
34 |
hovertemplate="<br>".join(
|
35 |
-
[
|
|
|
|
|
|
|
36 |
)
|
37 |
)
|
38 |
fig.update_layout(
|
@@ -43,7 +46,7 @@ def get_lat_score_mem_fig(llm_perf_df):
|
|
43 |
"xanchor": "center",
|
44 |
"yanchor": "top",
|
45 |
},
|
46 |
-
xaxis_title="Time To Generate
|
47 |
yaxis_title="Open LLM Score (%)",
|
48 |
legend_title="LLM Architecture",
|
49 |
width=1200,
|
|
|
1 |
import gradio as gr
|
2 |
import plotly.express as px
|
3 |
|
|
|
4 |
SCORE_MEMORY_LATENCY_DATA = [
|
5 |
"Model π€",
|
|
|
6 |
"Backend π",
|
7 |
+
"Precision π₯",
|
8 |
"Params (B)",
|
|
|
|
|
9 |
"Quantization ποΈ",
|
10 |
+
"Attention ποΈ",
|
11 |
+
"Kernel βοΈ",
|
12 |
"Open LLM Score (%)",
|
13 |
"Prefill (s)",
|
14 |
"Decode (tokens/s)",
|
15 |
"Memory (MB)",
|
16 |
"End-to-End (s)",
|
17 |
+
"Architecture ποΈ",
|
18 |
]
|
19 |
|
20 |
|
|
|
32 |
)
|
33 |
fig.update_traces(
|
34 |
hovertemplate="<br>".join(
|
35 |
+
[
|
36 |
+
f"<b>{column}:</b> %{{customdata[{i}]}}"
|
37 |
+
for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)
|
38 |
+
]
|
39 |
)
|
40 |
)
|
41 |
fig.update_layout(
|
|
|
46 |
"xanchor": "center",
|
47 |
"yanchor": "top",
|
48 |
},
|
49 |
+
xaxis_title="Time To Generate 64 Tokens (s)",
|
50 |
yaxis_title="Open LLM Score (%)",
|
51 |
legend_title="LLM Architecture",
|
52 |
width=1200,
|
src/utils.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
LLM_MODEL_ARCHS = {
|
2 |
"stablelm_epoch": "π΄ StableLM-Epoch",
|
3 |
"stablelm_alpha": "π΄ StableLM-Alpha",
|
@@ -16,16 +18,14 @@ LLM_MODEL_ARCHS = {
|
|
16 |
"llama": "π¦ LLaMA",
|
17 |
"rwkv": "π¦ββ¬ RWKV",
|
18 |
"deci": "π΅ deci",
|
19 |
-
"Yi": "π« Yi δΊΊ",
|
20 |
"mpt": "𧱠MPT",
|
21 |
# suggest something
|
22 |
"gpt_neox": "GPT-NeoX",
|
23 |
"gpt_neo": "GPT-Neo",
|
24 |
"gpt2": "GPT-2",
|
25 |
"gptj": "GPT-J",
|
26 |
-
"xglm": "XGLM",
|
27 |
"bart": "BART",
|
28 |
-
"opt": "OPT",
|
29 |
}
|
30 |
|
31 |
|
@@ -33,11 +33,13 @@ def model_hyperlink(link, model_name):
|
|
33 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
34 |
|
35 |
|
36 |
-
def
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
return
|
|
|
|
|
41 |
|
42 |
|
43 |
def process_score(score, quantization):
|
@@ -47,25 +49,53 @@ def process_score(score, quantization):
|
|
47 |
return f"{score:.2f} "
|
48 |
|
49 |
|
50 |
-
def
|
51 |
-
if
|
|
|
|
|
|
|
52 |
return "BnB.4bit"
|
53 |
-
elif
|
54 |
-
|
55 |
-
|
56 |
-
x["backend.quantization_config.exllama_config.version"] == 1
|
57 |
):
|
58 |
-
return "
|
59 |
-
elif (
|
60 |
-
x["backend.
|
|
|
61 |
):
|
62 |
-
return "GPTQ.4bit+ExllamaV2"
|
63 |
-
elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
|
64 |
return "GPTQ.4bit"
|
65 |
-
elif
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
else:
|
70 |
return "None"
|
71 |
|
|
|
1 |
+
from transformers import AutoConfig
|
2 |
+
|
3 |
LLM_MODEL_ARCHS = {
|
4 |
"stablelm_epoch": "π΄ StableLM-Epoch",
|
5 |
"stablelm_alpha": "π΄ StableLM-Alpha",
|
|
|
18 |
"llama": "π¦ LLaMA",
|
19 |
"rwkv": "π¦ββ¬ RWKV",
|
20 |
"deci": "π΅ deci",
|
21 |
+
"Yi": "π« Yi δΊΊ", # people
|
22 |
"mpt": "𧱠MPT",
|
23 |
# suggest something
|
24 |
"gpt_neox": "GPT-NeoX",
|
25 |
"gpt_neo": "GPT-Neo",
|
26 |
"gpt2": "GPT-2",
|
27 |
"gptj": "GPT-J",
|
|
|
28 |
"bart": "BART",
|
|
|
29 |
}
|
30 |
|
31 |
|
|
|
33 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
34 |
|
35 |
|
36 |
+
def process_architectures(model):
|
37 |
+
# return "Unknown"
|
38 |
+
try:
|
39 |
+
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
|
40 |
+
return LLM_MODEL_ARCHS.get(config.model_type, "Unknown")
|
41 |
+
except Exception:
|
42 |
+
return "Unknown"
|
43 |
|
44 |
|
45 |
def process_score(score, quantization):
|
|
|
49 |
return f"{score:.2f} "
|
50 |
|
51 |
|
52 |
+
def process_quantizations(x):
|
53 |
+
if (
|
54 |
+
x["config.backend.quantization_scheme"] == "bnb"
|
55 |
+
and x["config.backend.quantization_config.load_in_4bit"] is True
|
56 |
+
):
|
57 |
return "BnB.4bit"
|
58 |
+
elif (
|
59 |
+
x["config.backend.quantization_scheme"] == "bnb"
|
60 |
+
and x["config.backend.quantization_config.load_in_8bit"] is True
|
|
|
61 |
):
|
62 |
+
return "BnB.8bit"
|
63 |
+
elif (
|
64 |
+
x["config.backend.quantization_scheme"] == "gptq"
|
65 |
+
and x["config.backend.quantization_config.bits"] == 4
|
66 |
):
|
|
|
|
|
67 |
return "GPTQ.4bit"
|
68 |
+
elif (
|
69 |
+
x["config.backend.quantization_scheme"] == "awq"
|
70 |
+
and x["config.backend.quantization_config.bits"] == 4
|
71 |
+
):
|
72 |
+
return "AWQ.4bit"
|
73 |
+
else:
|
74 |
+
return "None"
|
75 |
+
|
76 |
+
|
77 |
+
def process_kernels(x):
|
78 |
+
if (
|
79 |
+
x["config.backend.quantization_scheme"] == "gptq"
|
80 |
+
and x["config.backend.quantization_config.version"] == 1
|
81 |
+
):
|
82 |
+
return "GPTQ.ExllamaV1"
|
83 |
+
|
84 |
+
elif (
|
85 |
+
x["config.backend.quantization_scheme"] == "gptq"
|
86 |
+
and x["config.backend.quantization_config.version"] == 2
|
87 |
+
):
|
88 |
+
return "GPTQ.ExllamaV2"
|
89 |
+
elif (
|
90 |
+
x["config.backend.quantization_scheme"] == "awq"
|
91 |
+
and x["config.backend.quantization_config.version"] == "gemm"
|
92 |
+
):
|
93 |
+
return "AWQ.GEMM"
|
94 |
+
elif (
|
95 |
+
x["config.backend.quantization_scheme"] == "awq"
|
96 |
+
and x["config.backend.quantization_config.version"] == "gemv"
|
97 |
+
):
|
98 |
+
return "AWQ.GEMV"
|
99 |
else:
|
100 |
return "None"
|
101 |
|