IlyasMoutawwakil HF staff committed on
Commit
76b423c
·
1 Parent(s): 57896bb
app.py CHANGED
@@ -1,26 +1,25 @@
1
- import os
2
-
3
  import gradio as gr
4
 
5
- from src.control_panel import create_control_panel, create_control_callback, create_select_callback
6
- from src.latency_score_memory import create_lat_score_mem_plot
7
- from src.quantization_kernels import create_quant_plots
8
- from src.leaderboard import create_leaderboard_table
9
- from src.bettertransformer import create_bt_plots
10
- from src.flashattentionv2 import create_fa2_plots
11
- from src.llm_perf import get_llm_perf_df
12
  from src.assets import custom_css
13
- from src.content import (
14
- LOGO,
15
- TITLE,
16
- ABOUT,
17
- CITATION_BUTTON,
18
- CITATION_BUTTON_LABEL,
 
19
  )
 
 
 
20
 
 
21
 
22
- MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB-275W πŸ–₯️", "audace": "RTX4090-24GB-450W πŸ’»"}
23
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
 
 
24
 
25
 
26
  demo = gr.Blocks(css=custom_css)
@@ -41,22 +40,27 @@ with demo:
41
  datatype_checkboxes,
42
  optimization_checkboxes,
43
  quantization_checkboxes,
44
- ) = create_control_panel()
45
  ####################### HARDWARE SUBTABS #######################
46
  with gr.Tabs(elem_classes="subtabs"):
47
- llm_perf_df = get_llm_perf_df(machine=machine)
48
  ####################### LEADERBOARD TAB #######################
49
  with gr.TabItem("Leaderboard πŸ…", id=0):
50
- search_bar, columns_checkboxes, leaderboard_table = create_leaderboard_table(llm_perf_df)
 
 
51
  with gr.TabItem("Find Your Best Model 🧭", id=1):
52
- lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
53
- ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
54
- with gr.TabItem("ScaledDotProductAttention πŸ“ˆ", id=2):
55
- bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
56
- with gr.TabItem("FlashAttentionV2 πŸ“ˆ", id=3):
57
- fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
58
- with gr.TabItem("Quantization Kernels πŸ“ˆ", id=4):
59
- quant_prefill_plot, quant_decode_plot = create_quant_plots(llm_perf_df)
 
 
 
60
 
61
  ####################### CONTROL CALLBACK #######################
62
  create_control_callback(
@@ -75,12 +79,10 @@ with demo:
75
  # outputs
76
  leaderboard_table,
77
  lat_score_mem_plot,
78
- bt_prefill_plot,
79
- bt_decode_plot,
80
- fa2_prefill_plot,
81
- fa2_decode_plot,
82
- quant_prefill_plot,
83
- quant_decode_plot,
84
  )
85
 
86
  create_select_callback(
 
 
 
1
  import gradio as gr
2
 
 
 
 
 
 
 
 
3
  from src.assets import custom_css
4
+
5
+ # from src.attention import create_attn_plots
6
+ from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
7
+ from src.control_panel import (
8
+ create_control_callback,
9
+ create_control_panel,
10
+ create_select_callback,
11
  )
12
+ from src.leaderboard import create_leaderboard_table
13
+ from src.llm_perf import get_llm_perf_df
14
+ from src.map import create_lat_score_mem_plot
15
 
16
+ # from custom_kernels import create_quant_krnl_plots
17
 
18
+ MACHINE_TO_HARDWARE = {
19
+ "1xA10": "A10-24GB-150W πŸ–₯️",
20
+ "1xA100": "A100-80GB-275W πŸ–₯️",
21
+ # "1xH100": "H100-80GB-700W πŸ–₯️",
22
+ }
23
 
24
 
25
  demo = gr.Blocks(css=custom_css)
 
40
  datatype_checkboxes,
41
  optimization_checkboxes,
42
  quantization_checkboxes,
43
+ ) = create_control_panel(machine=machine)
44
  ####################### HARDWARE SUBTABS #######################
45
  with gr.Tabs(elem_classes="subtabs"):
46
+ open_llm_perf_df = get_llm_perf_df(machine=machine)
47
  ####################### LEADERBOARD TAB #######################
48
  with gr.TabItem("Leaderboard πŸ…", id=0):
49
+ search_bar, columns_checkboxes, leaderboard_table = (
50
+ create_leaderboard_table(open_llm_perf_df)
51
+ )
52
  with gr.TabItem("Find Your Best Model 🧭", id=1):
53
+ lat_score_mem_plot = create_lat_score_mem_plot(open_llm_perf_df)
54
+ ###################### ATTENTIONS SPEEDUP TAB #######################
55
+ # with gr.TabItem("Attention πŸ“ˆ", id=2):
56
+ # attn_prefill_plot, attn_decode_plot = create_attn_plots(
57
+ # open_llm_perf_df
58
+ # )
59
+ # ####################### KERNELS SPEEDUP TAB #######################
60
+ # with gr.TabItem("Kernels πŸ“ˆ", id=4):
61
+ # quant_krnl_prefill_plot, quant_krnl_decode_plot = (
62
+ # create_quant_krnl_plots(llm_perf_df)
63
+ # )
64
 
65
  ####################### CONTROL CALLBACK #######################
66
  create_control_callback(
 
79
  # outputs
80
  leaderboard_table,
81
  lat_score_mem_plot,
82
+ # attn_prefill_plot,
83
+ # attn_decode_plot,
84
+ # quant_krnl_prefill_plot,
85
+ # quant_krnl_decode_plot,
 
 
86
  )
87
 
88
  create_select_callback(
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  huggingface_hub
 
2
  gradio
3
  plotly
4
  pandas
 
1
  huggingface_hub
2
+ transformers
3
  gradio
4
  plotly
5
  pandas
src/{flashattentionv2.py → attention.py} RENAMED
@@ -2,143 +2,156 @@ import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
 
5
-
6
- FLASHATTENTIONV2_DATA = [
7
  # open llm
8
  "Model πŸ€—",
9
- "DType πŸ“₯",
10
- "Backend 🏭",
11
  "Params (B)",
12
  "Architecture πŸ›οΈ",
13
  "Open LLM Score (%)",
14
  # deployment settings
15
- "DType πŸ“₯",
16
  "Backend 🏭",
17
- "Optimization πŸ› οΈ",
18
  "Quantization πŸ—œοΈ",
19
- "Optimization πŸ› οΈ FlashAttentionV2",
 
 
20
  # primary measurements
21
  "Prefill (s)",
22
- "Prefill (s) FlashAttentionV2",
23
  "Decode (tokens/s)",
24
- "Decode (tokens/s) FlashAttentionV2",
25
- "End-to-End (tokens/s)",
26
- "End-to-End (tokens/s) FlashAttentionV2",
27
  # speedups
28
  "Prefill Speedup (%)",
29
  "Decode Speedup (%)",
30
  ]
31
 
32
 
33
- def get_fa2_df(llm_perf_df):
34
- copy_df = llm_perf_df.copy()
35
- # seperate original model experiments from FlashAttentionV2 experiments
36
- original_df = copy_df[(copy_df["Optimization πŸ› οΈ"] == "None") & (copy_df["DType πŸ“₯"] == "float16")]
37
- fa2_df = copy_df[(copy_df["Optimization πŸ› οΈ"] == "FlashAttentionV2") & (copy_df["DType πŸ“₯"] == "float16")]
38
- # merge the two dataframes
 
 
 
 
 
 
 
 
 
 
39
  fa2_df = pd.merge(
40
- original_df,
41
  fa2_df,
42
- on=["Model πŸ€—", "Quantization πŸ—œοΈ"],
43
- suffixes=["", " FlashAttentionV2"],
44
  )
 
 
 
45
  # compute speedups
46
- fa2_df["Prefill Speedup (%)"] = ((fa2_df["Prefill (s)"] / fa2_df["Prefill (s) FlashAttentionV2"]) * 100).round(
47
- 2
48
- ) - 100
49
- fa2_df["Decode Speedup (%)"] = (
50
- (fa2_df["Decode (tokens/s) FlashAttentionV2"] / fa2_df["Decode (tokens/s)"]) * 100
51
  ).round(2) - 100
52
- # filter speedups > 1000%
53
- fa2_df = fa2_df[fa2_df["Prefill Speedup (%)"] < 1000]
54
- fa2_df = fa2_df[fa2_df["Decode Speedup (%)"] < 1000]
55
 
56
- return fa2_df
57
 
58
 
59
- def get_fa2_decode_fig(llm_perf_df):
60
- fa2_df = get_fa2_df(llm_perf_df)
61
  # plot
62
- decode_fig = px.box(
63
- fa2_df,
64
  x="Architecture πŸ›οΈ",
65
- y="Decode Speedup (%)",
66
  color_discrete_sequence=px.colors.qualitative.Light24,
67
- custom_data=FLASHATTENTIONV2_DATA,
68
- color="Quantization πŸ—œοΈ",
69
  points="all",
70
  )
71
  # add hover data
72
- decode_fig.update_traces(
73
  hovertemplate="<br>".join(
74
- [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
 
 
 
75
  )
76
  )
77
  # add layout
78
- decode_fig.update_layout(
79
  title={
80
- "text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
81
- "y": 0.95,
82
- "x": 0.5,
83
  "xanchor": "center",
84
  "yanchor": "top",
 
 
85
  },
 
86
  xaxis_title="LLM Architecture",
87
- yaxis_title="Decode Speedup (%)",
88
- legend_title="Quantization Scheme",
89
  width=1200,
90
  height=600,
91
  )
92
 
93
- return decode_fig
94
 
95
 
96
- def get_fa2_prefill_fig(llm_perf_df):
97
- fa2_df = get_fa2_df(llm_perf_df)
 
98
  # plot
99
- prefill_fig = px.box(
100
- fa2_df,
101
  x="Architecture πŸ›οΈ",
102
- y="Prefill Speedup (%)",
103
  color_discrete_sequence=px.colors.qualitative.Light24,
104
- custom_data=FLASHATTENTIONV2_DATA,
105
- color="Quantization πŸ—œοΈ",
106
  points="all",
107
  )
108
  # add hover data
109
- prefill_fig.update_traces(
110
  hovertemplate="<br>".join(
111
- [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
 
 
 
112
  )
113
  )
114
  # add layout
115
- prefill_fig.update_layout(
116
  title={
117
- "text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
118
- "y": 0.95,
119
- "x": 0.5,
120
  "xanchor": "center",
121
  "yanchor": "top",
 
 
122
  },
 
123
  xaxis_title="LLM Architecture",
124
- yaxis_title="Prefill Speedup (%)",
125
- legend_title="Quantization Scheme",
126
  width=1200,
127
  height=600,
128
  )
129
 
130
- return prefill_fig
131
 
132
 
133
- def create_fa2_plots(llm_perf_df):
134
  # descriptive text
135
  gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")
136
  # get figures
137
- prefill_fig = get_fa2_prefill_fig(llm_perf_df)
138
- decode_fig = get_fa2_decode_fig(llm_perf_df)
139
 
140
  # create plots
141
- prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
 
 
142
  decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
143
 
144
  return prefill_plot, decode_plot
 
2
  import pandas as pd
3
  import plotly.express as px
4
 
5
+ ATTN_DATA = [
 
6
  # open llm
7
  "Model πŸ€—",
8
+ "Experiment πŸ§ͺ",
 
9
  "Params (B)",
10
  "Architecture πŸ›οΈ",
11
  "Open LLM Score (%)",
12
  # deployment settings
 
13
  "Backend 🏭",
 
14
  "Quantization πŸ—œοΈ",
15
+ "Precision πŸ“₯",
16
+ "Attention πŸ‘οΈ",
17
+ "Kernel βš›οΈ",
18
  # primary measurements
19
  "Prefill (s)",
 
20
  "Decode (tokens/s)",
 
 
 
21
  # speedups
22
  "Prefill Speedup (%)",
23
  "Decode Speedup (%)",
24
  ]
25
 
26
 
27
+ def get_attn_df(open_llm_perf_df):
28
+ copy_df = open_llm_perf_df.copy()
29
+ copy_df["Quantization & Kernel"] = (
30
+ copy_df["Quantization πŸ—œοΈ"] + " & " + copy_df["Kernel βš›οΈ"]
31
+ )
32
+
33
+ eager_df = copy_df[(copy_df["Attention πŸ‘οΈ"] == "Eager")]
34
+ sdpa_df = copy_df[(copy_df["Attention πŸ‘οΈ"] == "SDPA")]
35
+ fa2_df = copy_df[(copy_df["Attention πŸ‘οΈ"] == "FAv2")]
36
+
37
+ sdpa_df = pd.merge(
38
+ eager_df,
39
+ sdpa_df,
40
+ on=["Model πŸ€—", "Quantization & Kernel"],
41
+ suffixes=["", " other"],
42
+ )
43
  fa2_df = pd.merge(
44
+ eager_df,
45
  fa2_df,
46
+ on=["Model πŸ€—", "Quantization & Kernel"],
47
+ suffixes=["", " other"],
48
  )
49
+
50
+ attn_df = pd.concat([sdpa_df, fa2_df])
51
+
52
  # compute speedups
53
+ attn_df["Prefill Speedup (%)"] = (
54
+ (attn_df["Prefill (s)"] / attn_df["Prefill (s) other"]) * 100
55
+ ).round(2) - 100
56
+ attn_df["Decode Speedup (%)"] = (
57
+ (attn_df["Decode (tokens/s) other"] / attn_df["Decode (tokens/s)"]) * 100
58
  ).round(2) - 100
 
 
 
59
 
60
+ return attn_df
61
 
62
 
63
+ def get_attn_prefill_fig(open_llm_perf_df):
64
+ attn_df = get_attn_df(open_llm_perf_df)
65
  # plot
66
+ prefill_fig = px.box(
67
+ attn_df,
68
  x="Architecture πŸ›οΈ",
69
+ y="Prefill Speedup (%)",
70
  color_discrete_sequence=px.colors.qualitative.Light24,
71
+ custom_data=ATTN_DATA,
72
+ color="Attention πŸ‘οΈ other",
73
  points="all",
74
  )
75
  # add hover data
76
+ prefill_fig.update_traces(
77
  hovertemplate="<br>".join(
78
+ [
79
+ f"<b>{column}:</b> %{{customdata[{i}]}}"
80
+ for i, column in enumerate(ATTN_DATA)
81
+ ]
82
  )
83
  )
84
  # add layout
85
+ prefill_fig.update_layout(
86
  title={
87
+ "text": "Prefill Speedup per Architecture, Compared To Eager Attention",
 
 
88
  "xanchor": "center",
89
  "yanchor": "top",
90
+ "y": 0.95,
91
+ "x": 0.5,
92
  },
93
+ yaxis_title="Prefill Speedup (%)",
94
  xaxis_title="LLM Architecture",
95
+ legend_title="Attention",
 
96
  width=1200,
97
  height=600,
98
  )
99
 
100
+ return prefill_fig
101
 
102
 
103
+ def get_attn_decode_fig(open_llm_perf_df):
104
+ attn_df = get_attn_df(open_llm_perf_df)
105
+ print(len(attn_df))
106
  # plot
107
+ decode_fig = px.box(
108
+ attn_df,
109
  x="Architecture πŸ›οΈ",
110
+ y="Decode Speedup (%)",
111
  color_discrete_sequence=px.colors.qualitative.Light24,
112
+ custom_data=ATTN_DATA,
113
+ color="Attention πŸ‘οΈ other",
114
  points="all",
115
  )
116
  # add hover data
117
+ decode_fig.update_traces(
118
  hovertemplate="<br>".join(
119
+ [
120
+ f"<b>{column}:</b> %{{customdata[{i}]}}"
121
+ for i, column in enumerate(ATTN_DATA)
122
+ ]
123
  )
124
  )
125
  # add layout
126
+ decode_fig.update_layout(
127
  title={
128
+ "text": "Decode Speedup per Architecture, Compared To Eager Attention",
 
 
129
  "xanchor": "center",
130
  "yanchor": "top",
131
+ "y": 0.95,
132
+ "x": 0.5,
133
  },
134
+ yaxis_title="Decode Speedup (%)",
135
  xaxis_title="LLM Architecture",
136
+ legend_title="Attention",
 
137
  width=1200,
138
  height=600,
139
  )
140
 
141
+ return decode_fig
142
 
143
 
144
+ def create_attn_plots(open_llm_perf_df):
145
  # descriptive text
146
  gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")
147
  # get figures
148
+ prefill_fig = get_attn_prefill_fig(open_llm_perf_df)
149
+ decode_fig = get_attn_decode_fig(open_llm_perf_df)
150
 
151
  # create plots
152
+ prefill_plot = gr.components.Plot(
153
+ value=prefill_fig, elem_id="plot", show_label=False
154
+ )
155
  decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
156
 
157
  return prefill_plot, decode_plot
src/bettertransformer.py DELETED
@@ -1,144 +0,0 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import plotly.express as px
4
-
5
-
6
- BETTERTRANSFORMER_DATA = [
7
- # open llm
8
- "Model πŸ€—",
9
- "DType πŸ“₯",
10
- "Backend 🏭",
11
- "Params (B)",
12
- "Architecture πŸ›οΈ",
13
- "Open LLM Score (%)",
14
- # deployment settings
15
- "DType πŸ“₯",
16
- "Backend 🏭",
17
- "Optimization πŸ› οΈ",
18
- "Quantization πŸ—œοΈ",
19
- "Optimization πŸ› οΈ BetterTransformer",
20
- # primary measurements
21
- "Prefill (s)",
22
- "Prefill (s) BetterTransformer",
23
- "Decode (tokens/s)",
24
- "Decode (tokens/s) BetterTransformer",
25
- "End-to-End (tokens/s)",
26
- "End-to-End (tokens/s) BetterTransformer",
27
- # speedups
28
- "Prefill Speedup (%)",
29
- "Decode Speedup (%)",
30
- ]
31
-
32
-
33
- def get_bt_df(llm_perf_df):
34
- copy_df = llm_perf_df.copy()
35
- # seperate original model experiments from BetterTransformer experiments
36
- original_df = copy_df[(copy_df["Optimization πŸ› οΈ"] == "None") & (copy_df["DType πŸ“₯"] == "float16")]
37
- bt_df = copy_df[(copy_df["Optimization πŸ› οΈ"] == "BetterTransformer") & (copy_df["DType πŸ“₯"] == "float16")]
38
- # merge the two dataframes
39
- bt_df = pd.merge(
40
- original_df,
41
- bt_df,
42
- on=["Model πŸ€—", "Quantization πŸ—œοΈ"],
43
- suffixes=["", " BetterTransformer"],
44
- )
45
- # compute speedups
46
- bt_df["Prefill Speedup (%)"] = (
47
- (bt_df["Prefill (s)"] / bt_df["Prefill (s) BetterTransformer"]) * 100
48
- ).round(2) - 100
49
- bt_df["Decode Speedup (%)"] = (
50
- (bt_df["Decode (tokens/s) BetterTransformer"] / bt_df["Decode (tokens/s)"]) * 100
51
- ).round(2) - 100
52
- # filter speedups > 1000%
53
- bt_df = bt_df[bt_df["Prefill Speedup (%)"] < 1000]
54
- bt_df = bt_df[bt_df["Decode Speedup (%)"] < 1000]
55
-
56
- return bt_df
57
-
58
-
59
- def get_bt_prefill_fig(llm_perf_df):
60
- bt_df = get_bt_df(llm_perf_df)
61
- # plot
62
- prefill_fig = px.box(
63
- bt_df,
64
- x="Architecture πŸ›οΈ",
65
- y="Prefill Speedup (%)",
66
- color_discrete_sequence=px.colors.qualitative.Light24,
67
- custom_data=BETTERTRANSFORMER_DATA,
68
- color="Quantization πŸ—œοΈ",
69
- points="all",
70
- )
71
- # add hover data
72
- prefill_fig.update_traces(
73
- hovertemplate="<br>".join(
74
- [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
75
- )
76
- )
77
- # add layout
78
- prefill_fig.update_layout(
79
- title={
80
- "text": "Prefill Speedup per Architecture, Compared To Non-Optimized Model",
81
- "y": 0.95,
82
- "x": 0.5,
83
- "xanchor": "center",
84
- "yanchor": "top",
85
- },
86
- xaxis_title="LLM Architecture",
87
- yaxis_title="Prefill Speedup (%)",
88
- legend_title="Quantization Scheme",
89
- width=1200,
90
- height=600,
91
- )
92
-
93
- return prefill_fig
94
-
95
-
96
- def get_bt_decode_fig(llm_perf_df):
97
- bt_df = get_bt_df(llm_perf_df)
98
- # plot
99
- decode_fig = px.box(
100
- bt_df,
101
- x="Architecture πŸ›οΈ",
102
- y="Decode Speedup (%)",
103
- color_discrete_sequence=px.colors.qualitative.Light24,
104
- custom_data=BETTERTRANSFORMER_DATA,
105
- color="Quantization πŸ—œοΈ",
106
- points="all",
107
- )
108
- # add hover data
109
- decode_fig.update_traces(
110
- hovertemplate="<br>".join(
111
- [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
112
- )
113
- )
114
- # add layout
115
- decode_fig.update_layout(
116
- title={
117
- "text": "Decode Speedup per Architecture, Compared To Non-Optimized Model",
118
- "y": 0.95,
119
- "x": 0.5,
120
- "xanchor": "center",
121
- "yanchor": "top",
122
- },
123
- xaxis_title="LLM Architecture",
124
- yaxis_title="Decode Speedup (%)",
125
- legend_title="Quantization Scheme",
126
- width=1200,
127
- height=600,
128
- )
129
-
130
- return decode_fig
131
-
132
-
133
- def create_bt_plots(llm_perf_df):
134
- # descriptive text
135
- gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")
136
- # get figures
137
- prefill_fig = get_bt_prefill_fig(llm_perf_df)
138
- decode_fig = get_bt_decode_fig(llm_perf_df)
139
-
140
- # create plots
141
- prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
142
- decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
143
-
144
- return prefill_plot, decode_plot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/content.py CHANGED
@@ -14,7 +14,7 @@ configuration for automated benchmarking:
14
 
15
  - Model evaluation requests should be made in the
16
  [🤗 Open LLM Leaderboard 🏅](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
17
- we scrape the list of pretrained base models from there.
18
  - Hardware/Backend/Optimization configuration requests should be made in the
19
  [🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
20
  [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
@@ -23,7 +23,7 @@ we scrape the list of pretrained base models from there.
23
 
24
  - To avoid communication-dependent results, only one GPU is used.
25
  - Score is the average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
26
- - LLMs are running on a singleton batch with a prompt size of 256 and generating a 256 tokens.
27
  - Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.
28
  - We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.
29
 
 
14
 
15
  - Model evaluation requests should be made in the
16
  [🤗 Open LLM Leaderboard 🏅](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) ;
17
+ we scrape the [list of canonical base models](https://github.com/huggingface/optimum-benchmark/blob/main/llm_perf/utils.py) from there.
18
  - Hardware/Backend/Optimization configuration requests should be made in the
19
  [🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
20
  [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).
 
23
 
24
  - To avoid communication-dependent results, only one GPU is used.
25
  - Score is the average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
26
+ - LLMs are running on a singleton batch with a prompt size of 256 and generating 64 tokens for at least 10 iterations and 10 seconds.
27
  - Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.
28
  - We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two being reported by PyTorch and the last one being observed using PyNVML.
29
 
src/control_panel.py CHANGED
@@ -1,14 +1,14 @@
1
  import gradio as gr
2
 
3
- from src.llm_perf import get_llm_perf_df
4
  from src.leaderboard import get_leaderboard_df
5
- from src.latency_score_memory import get_lat_score_mem_fig
6
- from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
7
- from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
8
- from src.quantization_kernels import get_quant_prefill_fig, get_quant_decode_fig
 
9
 
10
 
11
- def create_control_panel(machine: str = "hf-dgx-01"):
12
  # controls
13
  machine_textbox = gr.Textbox(value=machine, visible=False)
14
  with gr.Accordion("Control Panel πŸŽ›οΈ", open=False, elem_id="control-panel"):
@@ -29,7 +29,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
29
  value=80 * 1024,
30
  elem_id="memory-slider",
31
  )
32
- with gr.Column(scale=1):
33
  backend_checkboxes = gr.CheckboxGroup(
34
  label="Backends 🏭",
35
  choices=["pytorch"],
@@ -40,7 +40,7 @@ def create_control_panel(machine: str = "hf-dgx-01"):
40
  with gr.Row():
41
  with gr.Column(scale=1, variant="panel"):
42
  datatype_checkboxes = gr.CheckboxGroup(
43
- label="Load DTypes πŸ“₯",
44
  choices=["float32", "float16", "bfloat16"],
45
  value=["float32", "float16", "bfloat16"],
46
  info="β˜‘οΈ Select the load data types",
@@ -48,13 +48,13 @@ def create_control_panel(machine: str = "hf-dgx-01"):
48
  )
49
  with gr.Column(scale=1, variant="panel"):
50
  optimization_checkboxes = gr.CheckboxGroup(
51
- label="Optimizations πŸ› οΈ",
52
  choices=["None", "BetterTransformer", "FlashAttentionV2"],
53
  value=["None", "BetterTransformer", "FlashAttentionV2"],
54
  info="β˜‘οΈ Select the optimization",
55
  elem_id="optimization-checkboxes",
56
  )
57
- with gr.Column(scale=2):
58
  quantization_checkboxes = gr.CheckboxGroup(
59
  label="Quantizations πŸ—œοΈ",
60
  choices=[
@@ -118,29 +118,29 @@ def filter_fn(
118
  # raw_df["Model πŸ€—"].str.contains(model, case=False)
119
  raw_df["Backend 🏭"].isin(backends)
120
  & raw_df["DType πŸ“₯"].isin(datatypes)
121
- & raw_df["Optimization πŸ› οΈ"].isin(optimizations)
122
  & raw_df["Quantization πŸ—œοΈ"].isin(quantizations)
123
  & (raw_df["Open LLM Score (%)"] >= score)
124
  & (raw_df["Allocated Memory (MB)"] <= memory)
125
  ]
126
  filtered_leaderboard_df = select_fn(machine, columns, search)
127
  filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
128
- filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
129
- filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
130
- filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
131
- filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
132
- filtered_quant_prefill_fig = get_quant_prefill_fig(filtered_df)
133
- filtered_quant_decode_fig = get_quant_decode_fig(filtered_df)
134
 
135
  return [
136
  filtered_leaderboard_df,
137
  filtered_lat_score_mem_fig,
138
- filtered_bt_prefill_fig,
139
- filtered_bt_decode_fig,
140
- filtered_fa2_prefill_fig,
141
- filtered_fa2_decode_fig,
142
- filtered_quant_prefill_fig,
143
- filtered_quant_decode_fig,
144
  ]
145
 
146
 
@@ -162,12 +162,12 @@ def create_control_callback(
162
  # outputs
163
  leaderboard_table,
164
  lat_score_mem_plot,
165
- bt_prefill_plot,
166
- bt_decode_plot,
167
- fa2_prefill_plot,
168
- fa2_decode_plot,
169
- quant_prefill_plot,
170
- quant_decode_plot,
171
  ):
172
  filter_button.click(
173
  fn=filter_fn,
@@ -188,19 +188,19 @@ def create_control_callback(
188
  outputs=[
189
  leaderboard_table,
190
  lat_score_mem_plot,
191
- bt_prefill_plot,
192
- bt_decode_plot,
193
- fa2_prefill_plot,
194
- fa2_decode_plot,
195
- quant_prefill_plot,
196
- quant_decode_plot,
197
  ],
198
  )
199
 
200
 
201
  def select_fn(machine, columns, search):
202
- raw_df = get_llm_perf_df(machine=machine)
203
- selected_leaderboard_df = get_leaderboard_df(raw_df)
204
  selected_leaderboard_df = selected_leaderboard_df[
205
  selected_leaderboard_df["Model πŸ€—"].str.contains(search, case=False)
206
  ]
 
1
  import gradio as gr
2
 
 
3
  from src.leaderboard import get_leaderboard_df
4
+ from src.llm_perf import get_llm_perf_df
5
+
6
+ # from attention_implementations import get_attn_decode_fig, get_attn_prefill_fig
7
+ # from custom_kernels import get_kernel_decode_fig, get_kernel_prefill_fig
8
+ from src.map import get_lat_score_mem_fig
9
 
10
 
11
+ def create_control_panel(machine: str):
12
  # controls
13
  machine_textbox = gr.Textbox(value=machine, visible=False)
14
  with gr.Accordion("Control Panel πŸŽ›οΈ", open=False, elem_id="control-panel"):
 
29
  value=80 * 1024,
30
  elem_id="memory-slider",
31
  )
32
+ with gr.Column(scale=1, variant="panel"):
33
  backend_checkboxes = gr.CheckboxGroup(
34
  label="Backends 🏭",
35
  choices=["pytorch"],
 
40
  with gr.Row():
41
  with gr.Column(scale=1, variant="panel"):
42
  datatype_checkboxes = gr.CheckboxGroup(
43
+ label="DTypes πŸ“₯",
44
  choices=["float32", "float16", "bfloat16"],
45
  value=["float32", "float16", "bfloat16"],
46
  info="β˜‘οΈ Select the load data types",
 
48
  )
49
  with gr.Column(scale=1, variant="panel"):
50
  optimization_checkboxes = gr.CheckboxGroup(
51
+ label="Attentions πŸ‘οΈ",
52
  choices=["None", "BetterTransformer", "FlashAttentionV2"],
53
  value=["None", "BetterTransformer", "FlashAttentionV2"],
54
  info="β˜‘οΈ Select the optimization",
55
  elem_id="optimization-checkboxes",
56
  )
57
+ with gr.Column(scale=2, variant="panel"):
58
  quantization_checkboxes = gr.CheckboxGroup(
59
  label="Quantizations πŸ—œοΈ",
60
  choices=[
 
118
  # raw_df["Model πŸ€—"].str.contains(model, case=False)
119
  raw_df["Backend 🏭"].isin(backends)
120
  & raw_df["DType πŸ“₯"].isin(datatypes)
121
+ & raw_df["Attention πŸ‘οΈ"].isin(optimizations)
122
  & raw_df["Quantization πŸ—œοΈ"].isin(quantizations)
123
  & (raw_df["Open LLM Score (%)"] >= score)
124
  & (raw_df["Allocated Memory (MB)"] <= memory)
125
  ]
126
  filtered_leaderboard_df = select_fn(machine, columns, search)
127
  filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
128
+ # filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
129
+ # filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
130
+ # filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
131
+ # filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
132
+ # filtered_quant_prefill_fig = get_quant_prefill_fig(filtered_df)
133
+ # filtered_quant_decode_fig = get_quant_decode_fig(filtered_df)
134
 
135
  return [
136
  filtered_leaderboard_df,
137
  filtered_lat_score_mem_fig,
138
+ # filtered_bt_prefill_fig,
139
+ # filtered_bt_decode_fig,
140
+ # filtered_fa2_prefill_fig,
141
+ # filtered_fa2_decode_fig,
142
+ # filtered_quant_prefill_fig,
143
+ # filtered_quant_decode_fig,
144
  ]
145
 
146
 
 
162
  # outputs
163
  leaderboard_table,
164
  lat_score_mem_plot,
165
+ # attn_prefill_plot,
166
+ # attn_decode_plot,
167
+ # fa2_prefill_plot,
168
+ # fa2_decode_plot,
169
+ # quant_prefill_plot,
170
+ # quant_decode_plot,
171
  ):
172
  filter_button.click(
173
  fn=filter_fn,
 
188
  outputs=[
189
  leaderboard_table,
190
  lat_score_mem_plot,
191
+ # attn_prefill_plot,
192
+ # attn_decode_plot,
193
+ # fa2_prefill_plot,
194
+ # fa2_decode_plot,
195
+ # quant_prefill_plot,
196
+ # quant_decode_plot,
197
  ],
198
  )
199
 
200
 
201
  def select_fn(machine, columns, search):
202
+ llm_perf_df = get_llm_perf_df(machine=machine)
203
+ selected_leaderboard_df = get_leaderboard_df(llm_perf_df)
204
  selected_leaderboard_df = selected_leaderboard_df[
205
  selected_leaderboard_df["Model πŸ€—"].str.contains(search, case=False)
206
  ]
src/{quantization_kernels.py → kernels.py} RENAMED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
 
5
-
6
  QUANT_DATA = [
7
  # open llm
8
  "Model πŸ€—",
@@ -14,9 +13,7 @@ QUANT_DATA = [
14
  # deployment settings
15
  "DType πŸ“₯",
16
  "Backend 🏭",
17
- "Optimization πŸ› οΈ",
18
  "Quantization πŸ—œοΈ",
19
- "Optimization πŸ› οΈ Custom Kernel",
20
  "Quantization πŸ—œοΈ Custom Kernel",
21
  # primary measurements
22
  "Prefill (s)",
@@ -34,9 +31,8 @@ def get_quant_df(llm_perf_df):
34
  # separate vanilla GPTQ experiments from Custom Kernel experiments
35
  vanilla_df = copy_df[
36
  (copy_df["Backend 🏭"] == "pytorch")
37
- & (copy_df["Quantization πŸ—œοΈ"] == "None")
38
- & (copy_df["Optimization πŸ› οΈ"] == "None")
39
  & (copy_df["DType πŸ“₯"] == "float16")
 
40
  ]
41
  exllamav1_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV1")]
42
  exllamav2_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV2")]
@@ -70,11 +66,12 @@ def get_quant_df(llm_perf_df):
70
  # concat the two dataframes row-wise
71
  quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
72
  # compute speedups
73
- quant_df["Prefill Speedup (%)"] = ((quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100).round(
74
- 2
75
- ) - 100
76
  quant_df["Decode Speedup (%)"] = (
77
- (quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"]) * 100
 
78
  ).round(2) - 100
79
  # filter speedups > 1000%
80
  quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
@@ -97,7 +94,12 @@ def get_quant_decode_fig(llm_perf_df):
97
  )
98
  # add hover data
99
  decode_fig.update_traces(
100
- hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(QUANT_DATA)])
 
 
 
 
 
101
  )
102
  # add layout
103
  decode_fig.update_layout(
@@ -132,7 +134,12 @@ def get_quant_prefill_fig(llm_perf_df):
132
  )
133
  # add hover data
134
  prefill_fig.update_traces(
135
- hovertemplate="<br>".join([f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(QUANT_DATA)])
 
 
 
 
 
136
  )
137
  # add layout
138
  prefill_fig.update_layout(
@@ -161,7 +168,9 @@ def create_quant_plots(llm_perf_df):
161
  decode_fig = get_quant_decode_fig(llm_perf_df)
162
 
163
  # create plots
164
- prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
 
 
165
  decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
166
 
167
  return prefill_plot, decode_plot
 
2
  import pandas as pd
3
  import plotly.express as px
4
 
 
5
  QUANT_DATA = [
6
  # open llm
7
  "Model πŸ€—",
 
13
  # deployment settings
14
  "DType πŸ“₯",
15
  "Backend 🏭",
 
16
  "Quantization πŸ—œοΈ",
 
17
  "Quantization πŸ—œοΈ Custom Kernel",
18
  # primary measurements
19
  "Prefill (s)",
 
31
  # separate vanilla GPTQ experiments from Custom Kernel experiments
32
  vanilla_df = copy_df[
33
  (copy_df["Backend 🏭"] == "pytorch")
 
 
34
  & (copy_df["DType πŸ“₯"] == "float16")
35
+ & (copy_df["Quantization πŸ—œοΈ"] == "None")
36
  ]
37
  exllamav1_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV1")]
38
  exllamav2_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV2")]
 
66
  # concat the two dataframes row-wise
67
  quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
68
  # compute speedups
69
+ quant_df["Prefill Speedup (%)"] = (
70
+ (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
71
+ ).round(2) - 100
72
  quant_df["Decode Speedup (%)"] = (
73
+ (quant_df["Decode (tokens/s) Custom Kernel"] / quant_df["Decode (tokens/s)"])
74
+ * 100
75
  ).round(2) - 100
76
  # filter speedups > 1000%
77
  quant_df = quant_df[quant_df["Prefill Speedup (%)"] < 1000]
 
94
  )
95
  # add hover data
96
  decode_fig.update_traces(
97
+ hovertemplate="<br>".join(
98
+ [
99
+ f"<b>{column}:</b> %{{customdata[{i}]}}"
100
+ for i, column in enumerate(QUANT_DATA)
101
+ ]
102
+ )
103
  )
104
  # add layout
105
  decode_fig.update_layout(
 
134
  )
135
  # add hover data
136
  prefill_fig.update_traces(
137
+ hovertemplate="<br>".join(
138
+ [
139
+ f"<b>{column}:</b> %{{customdata[{i}]}}"
140
+ for i, column in enumerate(QUANT_DATA)
141
+ ]
142
+ )
143
  )
144
  # add layout
145
  prefill_fig.update_layout(
 
168
  decode_fig = get_quant_decode_fig(llm_perf_df)
169
 
170
  # create plots
171
+ prefill_plot = gr.components.Plot(
172
+ value=prefill_fig, elem_id="plot", show_label=False
173
+ )
174
  decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
175
 
176
  return prefill_plot, decode_plot
src/leaderboard.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
 
3
  from src.utils import model_hyperlink, process_score
4
 
5
-
6
  LEADERBOARD_COLUMN_TO_DATATYPE = {
7
  # open llm
8
  "Model πŸ€—": "markdown",
@@ -13,18 +12,18 @@ LEADERBOARD_COLUMN_TO_DATATYPE = {
13
  "Memory (MB)": "number",
14
  "Energy (tokens/kWh)": "number",
15
  # deployment settings
16
- "DType πŸ“₯": "str",
17
  "Backend 🏭": "str",
18
- "Optimization πŸ› οΈ": "str",
19
  "Quantization πŸ—œοΈ": "str",
 
 
20
  # additional measurements
21
- "Architecture πŸ›οΈ": "markdown",
22
- "Params (B)": "number",
23
  "Open LLM Score (%)": "number",
24
  "End-to-End (s)": "number",
25
- "End-to-End (tokens/s)": "number",
26
- "Reserved Memory (MB)": "number",
27
- "Used Memory (MB)": "number",
28
  }
29
 
30
  PRIMARY_COLUMNS = [
 
2
 
3
  from src.utils import model_hyperlink, process_score
4
 
 
5
  LEADERBOARD_COLUMN_TO_DATATYPE = {
6
  # open llm
7
  "Model πŸ€—": "markdown",
 
12
  "Memory (MB)": "number",
13
  "Energy (tokens/kWh)": "number",
14
  # deployment settings
 
15
  "Backend 🏭": "str",
16
+ "Precision πŸ“₯": "str",
17
  "Quantization πŸ—œοΈ": "str",
18
+ "Attention πŸ‘οΈ": "str",
19
+ "Kernel βš›οΈ": "str",
20
  # additional measurements
21
+ # "Reserved Memory (MB)": "number",
22
+ # "Used Memory (MB)": "number",
23
  "Open LLM Score (%)": "number",
24
  "End-to-End (s)": "number",
25
+ "Architecture πŸ›οΈ": "str",
26
+ "Params (B)": "number",
 
27
  }
28
 
29
  PRIMARY_COLUMNS = [
src/llm_perf.py CHANGED
@@ -1,123 +1,98 @@
1
  import os
2
 
3
  import pandas as pd
4
- from huggingface_hub import hf_hub_download
5
 
6
- from .utils import process_quantization_scheme, process_arch
7
-
8
- LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
9
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
10
 
11
  COLUMNS_MAPPING = {
12
- "Model": "Model πŸ€—",
13
- "experiment_name": "Experiment πŸ§ͺ",
14
  # primary measurements
15
- "forward.latency(s)": "Prefill (s)",
16
- "decode.throughput(tokens/s)": "Decode (tokens/s)",
17
- "generate.max_memory_allocated(MB)": "Memory (MB)",
18
- "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
 
19
  # deployment settings
20
- "backend.name": "Backend 🏭",
21
- "backend.torch_dtype": "DType πŸ“₯",
22
- "optimization": "Optimization πŸ› οΈ",
23
  "quantization": "Quantization πŸ—œοΈ",
24
- # additional measurements
25
- "Size": "Params (B)",
26
- "Arch": "Architecture πŸ›οΈ",
27
- "Score": "Open LLM Score (%)",
28
- "generate.latency(s)": "End-to-End (s)",
29
- "generate.throughput(tokens/s)": "End-to-End (tokens/s)",
30
- "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
31
- "generate.max_memory_used(MB)": "Used Memory (MB)",
32
  }
33
- SORTING_COLUMNS = [
34
- "Open LLM Score (%)",
35
- "Decode (tokens/s)",
36
- "Prefill (s)",
37
- ]
38
  SORTING_ASCENDING = [False, True, False]
39
 
40
 
41
- def get_llm_df():
42
- # commented for now since scraping script is not working
43
- hf_hub_download(
44
- repo_id=LLM_PERF_DATASET_REPO,
45
- filename="open-llm.csv",
46
- local_dir="dataset",
47
- repo_type="dataset",
48
- token=HF_TOKEN,
49
- )
50
-
51
- llm_df = pd.read_csv("dataset/open-llm.csv")
52
-
53
- return llm_df
54
 
 
 
 
 
55
 
56
- def get_perf_df(machine: str = "hf-dgx-01"):
57
- hf_hub_download(
58
- repo_id=LLM_PERF_DATASET_REPO,
59
- filename=f"{machine}/perf-report.csv",
60
- local_dir="dataset",
61
- repo_type="dataset",
62
- token=HF_TOKEN,
63
  )
64
- perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")
65
 
66
- return perf_df
67
 
68
 
69
- def get_llm_perf_df(machine: str = "hf-dgx-01"):
70
- # get dataframes
71
- llm_df = get_llm_df()
72
- perf_df = get_perf_df(machine=machine)
73
- llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
74
  # some assertions
75
- assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
76
- assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
77
- assert llm_perf_df["benchmark.new_tokens"].nunique() == 1
78
- # transpose energy consumption
79
- llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
80
- 1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
81
- ).astype(int)
82
- # fix nan values
83
- llm_perf_df.loc[
84
- llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
85
- "generate.energy_consumption(tokens/kWh)",
86
- ] = pd.NA
87
-
88
- # add optimization column
89
- llm_perf_df["optimization"] = llm_perf_df[["backend.to_bettertransformer", "backend.use_flash_attention_2"]].apply(
90
- lambda x: (
91
- "BetterTransformer"
92
- if x["backend.to_bettertransformer"]
93
- else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None")
94
- ),
95
- axis=1,
96
  )
97
- # add quantization scheme
98
- llm_perf_df["quantization"] = llm_perf_df[
99
- [
100
- "backend.quantization_scheme",
101
- "backend.quantization_config.bits",
102
- "backend.quantization_config.version",
103
- "backend.quantization_config.load_in_4bit",
104
- "backend.quantization_config.load_in_8bit",
105
- "backend.quantization_config.exllama_config.version",
106
- ]
107
- ].apply(lambda x: process_quantization_scheme(x), axis=1)
108
- # process experiment name
109
- llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("pytorch+cuda+", ""))
110
- llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(
111
- lambda x: x.replace("float16+", "").replace("float32+", "").replace("bfloat16+", "") if "bit" in x else x
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  )
113
- llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bnb-4bit", "BnB-4bit"))
114
- llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bnb-8bit", "BnB-8bit"))
115
- llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("awq-4bit", "AWQ-4bit"))
116
- llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("gptq-4bit", "GPTQ-4bit"))
117
- llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("bettertransformer", "SDPA"))
118
- llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(lambda x: x.replace("flash-attention-v2", "FA-v2"))
119
- # add arch
120
- llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)
121
  # filter columns
122
  llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
123
  # rename columns
@@ -130,3 +105,14 @@ def get_llm_perf_df(machine: str = "hf-dgx-01"):
130
  )
131
 
132
  return llm_perf_df
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
 
3
  import pandas as pd
 
4
 
5
+ from .utils import process_kernels, process_quantizations
 
 
 
6
 
7
  COLUMNS_MAPPING = {
8
+ "config.name": "Experiment πŸ§ͺ",
9
+ "config.backend.model": "Model πŸ€—",
10
  # primary measurements
11
+ "report.prefill.latency.p50": "Prefill (s)",
12
+ "report.per_token.latency.p50": "Per Token (s)",
13
+ "report.decode.throughput.value": "Decode (tokens/s)",
14
+ "report.decode.efficiency.value": "Energy (tokens/kWh)",
15
+ "report.decode.memory.max_allocated": "Memory (MB)",
16
  # deployment settings
17
+ "config.backend.name": "Backend 🏭",
18
+ "config.backend.torch_dtype": "Precision πŸ“₯",
 
19
  "quantization": "Quantization πŸ—œοΈ",
20
+ "attention": "Attention πŸ‘οΈ",
21
+ "kernel": "Kernel βš›οΈ",
22
+ # additional information
23
+ "architecture": "Architecture πŸ›οΈ",
24
+ "prefill+decode": "End-to-End (s)",
25
+ "Average ⬆️": "Open LLM Score (%)",
26
+ "#Params (B)": "Params (B)",
 
27
  }
28
+ SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
29
+ SUBSETS = ["unquantized", "awq", "bnb", "gptq"]
 
 
 
30
  SORTING_ASCENDING = [False, True, False]
31
 
32
 
33
def get_raw_llm_perf_df(machine: str = "1xA10"):
    """Load the raw benchmark reports for ``machine`` joined with Open LLM scores.

    One CSV per entry of ``SUBSETS`` is read from the
    ``optimum-benchmark/llm-perf-leaderboard`` dataset. A subset that cannot
    be read is reported on stdout and skipped. The reports that did load are
    concatenated row-wise and merged with the Open LLM leaderboard table,
    matching its ``Model`` column against ``config.backend.model``.

    Args:
        machine: Machine identifier embedded in the per-subset CSV file names.

    Returns:
        The merged, still unprocessed, DataFrame.

    Raises:
        ValueError: If not a single subset could be loaded for ``machine``.
    """
    dfs = []
    for subset in SUBSETS:
        try:
            subset_df = pd.read_csv(
                f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/llm-perf-leaderboard-{subset}-{machine}.csv"
            )
        except Exception:
            # Best effort: a machine is not required to have every subset,
            # so a failed read is reported and the subset is skipped.
            print(f"Subset {subset} for machine {machine} not found")
        else:
            dfs.append(subset_df)

    if not dfs:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # say what actually went wrong instead.
        raise ValueError(
            f"No benchmark subset could be loaded for machine {machine}"
        )

    llm_perf_df = pd.concat(dfs)
    open_llm_df = pd.read_csv(
        "hf://datasets/optimum-benchmark/open-llm-leaderboard/open-llm-leaderboard.csv"
    )

    llm_perf_df = pd.merge(
        open_llm_df, llm_perf_df, left_on="Model", right_on="config.backend.model"
    )

    return llm_perf_df
55
 
56
 
57
+ def processed_llm_perf_df(llm_perf_df):
 
 
 
 
58
  # some assertions
59
+ assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
60
+ assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
61
+ assert llm_perf_df["config.scenario.generate_kwargs.max_new_tokens"].nunique() == 1
62
+ assert llm_perf_df["config.scenario.generate_kwargs.min_new_tokens"].nunique() == 1
63
+ # fix couple stuff
64
+ llm_perf_df["config.name"] = llm_perf_df["config.name"].str.replace(
65
+ "flash_attention_2", "fa2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  )
67
+ llm_perf_df["prefill+decode"] = (
68
+ llm_perf_df["report.prefill.latency.p50"]
69
+ + (llm_perf_df["report.decode.latency.p50"])
70
+ )
71
+ # llm_perf_df["architecture"] = llm_perf_df["config.backend.model"].apply(
72
+ # process_architectures
73
+ # )
74
+ llm_perf_df["architecture"] = llm_perf_df["Architecture"]
75
+ llm_perf_df["attention"] = (
76
+ llm_perf_df["config.backend.attn_implementation"]
77
+ .str.replace("flash_attention_2", "FAv2")
78
+ .str.replace("eager", "Eager")
79
+ .str.replace("sdpa", "SDPA")
80
+ )
81
+ llm_perf_df["quantization"] = llm_perf_df.apply(process_quantizations, axis=1)
82
+ llm_perf_df["kernel"] = llm_perf_df.apply(process_kernels, axis=1)
83
+ # round numerical columns
84
+ llm_perf_df = llm_perf_df.round(
85
+ {
86
+ "report.prefill.latency.p50": 3,
87
+ "report.decode.latency.p50": 3,
88
+ "report.decode.throughput.value": 3,
89
+ "report.decode.efficiency.value": 3,
90
+ "report.decode.memory.max_allocated": 3,
91
+ "Average ⬆️": 3,
92
+ "prefill+decode": 3,
93
+ "#Params (B)": 3,
94
+ }
95
  )
 
 
 
 
 
 
 
 
96
  # filter columns
97
  llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
98
  # rename columns
 
105
  )
106
 
107
  return llm_perf_df
108
+
109
+
110
def get_llm_perf_df(machine: str = "1xA10"):
    """Return the processed leaderboard table for ``machine``, cached on disk.

    If ``llm-perf-leaderboard-{machine}.csv`` exists in the working
    directory it is read and returned as is. Otherwise the raw data is
    fetched, processed, written to that file (without the index) and
    returned.
    """
    cache_file = f"llm-perf-leaderboard-{machine}.csv"

    # Cache hit: reuse the table produced by an earlier run.
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file)

    # Cache miss: build the table and persist it for the next call.
    processed = processed_llm_perf_df(get_raw_llm_perf_df(machine))
    processed.to_csv(cache_file, index=False)
    return processed
src/{latency_score_memory.py β†’ map.py} RENAMED
@@ -1,20 +1,20 @@
1
  import gradio as gr
2
  import plotly.express as px
3
 
4
-
5
  SCORE_MEMORY_LATENCY_DATA = [
6
  "Model πŸ€—",
7
- "DType πŸ“₯",
8
  "Backend 🏭",
 
9
  "Params (B)",
10
- "Architecture πŸ›οΈ",
11
- "Optimization πŸ› οΈ",
12
  "Quantization πŸ—œοΈ",
 
 
13
  "Open LLM Score (%)",
14
  "Prefill (s)",
15
  "Decode (tokens/s)",
16
  "Memory (MB)",
17
  "End-to-End (s)",
 
18
  ]
19
 
20
 
@@ -32,7 +32,10 @@ def get_lat_score_mem_fig(llm_perf_df):
32
  )
33
  fig.update_traces(
34
  hovertemplate="<br>".join(
35
- [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
 
 
 
36
  )
37
  )
38
  fig.update_layout(
@@ -43,7 +46,7 @@ def get_lat_score_mem_fig(llm_perf_df):
43
  "xanchor": "center",
44
  "yanchor": "top",
45
  },
46
- xaxis_title="Time To Generate 256 Tokens (s)",
47
  yaxis_title="Open LLM Score (%)",
48
  legend_title="LLM Architecture",
49
  width=1200,
 
1
  import gradio as gr
2
  import plotly.express as px
3
 
 
4
  SCORE_MEMORY_LATENCY_DATA = [
5
  "Model πŸ€—",
 
6
  "Backend 🏭",
7
+ "Precision πŸ“₯",
8
  "Params (B)",
 
 
9
  "Quantization πŸ—œοΈ",
10
+ "Attention πŸ‘οΈ",
11
+ "Kernel βš›οΈ",
12
  "Open LLM Score (%)",
13
  "Prefill (s)",
14
  "Decode (tokens/s)",
15
  "Memory (MB)",
16
  "End-to-End (s)",
17
+ "Architecture πŸ›οΈ",
18
  ]
19
 
20
 
 
32
  )
33
  fig.update_traces(
34
  hovertemplate="<br>".join(
35
+ [
36
+ f"<b>{column}:</b> %{{customdata[{i}]}}"
37
+ for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)
38
+ ]
39
  )
40
  )
41
  fig.update_layout(
 
46
  "xanchor": "center",
47
  "yanchor": "top",
48
  },
49
+ xaxis_title="Time To Generate 64 Tokens (s)",
50
  yaxis_title="Open LLM Score (%)",
51
  legend_title="LLM Architecture",
52
  width=1200,
src/utils.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  LLM_MODEL_ARCHS = {
2
  "stablelm_epoch": "πŸ”΄ StableLM-Epoch",
3
  "stablelm_alpha": "πŸ”΄ StableLM-Alpha",
@@ -16,16 +18,14 @@ LLM_MODEL_ARCHS = {
16
  "llama": "πŸ¦™ LLaMA",
17
  "rwkv": "πŸ¦β€β¬› RWKV",
18
  "deci": "πŸ”΅ deci",
19
- "Yi": "πŸ«‚ Yi δΊΊ", # people
20
  "mpt": "🧱 MPT",
21
  # suggest something
22
  "gpt_neox": "GPT-NeoX",
23
  "gpt_neo": "GPT-Neo",
24
  "gpt2": "GPT-2",
25
  "gptj": "GPT-J",
26
- "xglm": "XGLM",
27
  "bart": "BART",
28
- "opt": "OPT",
29
  }
30
 
31
 
@@ -33,11 +33,13 @@ def model_hyperlink(link, model_name):
33
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
34
 
35
 
36
- def process_arch(model_arch):
37
- if model_arch in LLM_MODEL_ARCHS:
38
- return LLM_MODEL_ARCHS[model_arch]
39
- else:
40
- return model_arch
 
 
41
 
42
 
43
  def process_score(score, quantization):
@@ -47,25 +49,53 @@ def process_score(score, quantization):
47
  return f"{score:.2f} "
48
 
49
 
50
- def process_quantization_scheme(x):
51
- if x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_4bit"] == True:
 
 
 
52
  return "BnB.4bit"
53
- elif x["backend.quantization_scheme"] == "bnb" and x["backend.quantization_config.load_in_8bit"] == True:
54
- return "BnB.8bit"
55
- elif (x["backend.quantization_scheme"] == "gptq") and (
56
- x["backend.quantization_config.exllama_config.version"] == 1
57
  ):
58
- return "GPTQ.4bit+ExllamaV1"
59
- elif (x["backend.quantization_scheme"] == "gptq") and (
60
- x["backend.quantization_config.exllama_config.version"] == 2
 
61
  ):
62
- return "GPTQ.4bit+ExllamaV2"
63
- elif x["backend.quantization_scheme"] == "gptq" and x["backend.quantization_config.bits"] == 4:
64
  return "GPTQ.4bit"
65
- elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemm":
66
- return "AWQ.4bit+GEMM"
67
- elif x["backend.quantization_scheme"] == "awq" and x["backend.quantization_config.version"] == "gemv":
68
- return "AWQ.4bit+GEMV"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  else:
70
  return "None"
71
 
 
1
+ from transformers import AutoConfig
2
+
3
  LLM_MODEL_ARCHS = {
4
  "stablelm_epoch": "πŸ”΄ StableLM-Epoch",
5
  "stablelm_alpha": "πŸ”΄ StableLM-Alpha",
 
18
  "llama": "πŸ¦™ LLaMA",
19
  "rwkv": "πŸ¦β€β¬› RWKV",
20
  "deci": "πŸ”΅ deci",
21
+ "Yi": "πŸ«‚ Yi δΊΊ", # people
22
  "mpt": "🧱 MPT",
23
  # suggest something
24
  "gpt_neox": "GPT-NeoX",
25
  "gpt_neo": "GPT-Neo",
26
  "gpt2": "GPT-2",
27
  "gptj": "GPT-J",
 
28
  "bart": "BART",
 
29
  }
30
 
31
 
 
33
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
34
 
35
 
36
def process_architectures(model):
    """Translate a model identifier into its ``LLM_MODEL_ARCHS`` display label.

    The model configuration is loaded with ``AutoConfig.from_pretrained`` and
    its ``model_type`` is looked up in ``LLM_MODEL_ARCHS``. An unlisted
    ``model_type``, or any exception raised while loading or looking up,
    yields ``"Unknown"``.
    """
    fallback = "Unknown"
    try:
        # NOTE(review): trust_remote_code=True presumably allows code shipped
        # with the model repository to run locally - confirm this is intended.
        model_type = AutoConfig.from_pretrained(
            model, trust_remote_code=True
        ).model_type
        return LLM_MODEL_ARCHS.get(model_type, fallback)
    except Exception:
        return fallback
43
 
44
 
45
  def process_score(score, quantization):
 
49
  return f"{score:.2f} "
50
 
51
 
52
def _flag_is_set(value):
    """Return True when ``value`` equals ``True`` (Python or NumPy boolean)."""
    try:
        # Equality rather than identity: ``numpy.bool_(True) is True`` is
        # False, while NaN (a missing flag) still compares unequal to True.
        return bool(value == True)  # noqa: E712
    except (TypeError, ValueError):
        # The comparison result has no usable truth value (e.g. a missing
        # value marker); treat the flag as not set.
        return False


def process_quantizations(x):
    """Build the quantization label for one benchmark row.

    Args:
        x: Mapping-like row (e.g. a pandas Series) holding the
            ``config.backend.quantization_*`` fields. Only the fields
            relevant to the row's quantization scheme are read.

    Returns:
        ``"BnB.4bit"``, ``"BnB.8bit"``, ``"GPTQ.4bit"``, ``"AWQ.4bit"``, or
        the string ``"None"`` when no known scheme/setting matches.
    """
    scheme = x["config.backend.quantization_scheme"]

    if scheme == "bnb":
        # 4-bit takes precedence over 8-bit, as in the original chain.
        if _flag_is_set(x["config.backend.quantization_config.load_in_4bit"]):
            return "BnB.4bit"
        if _flag_is_set(x["config.backend.quantization_config.load_in_8bit"]):
            return "BnB.8bit"
    elif scheme == "gptq":
        if x["config.backend.quantization_config.bits"] == 4:
            return "GPTQ.4bit"
    elif scheme == "awq":
        if x["config.backend.quantization_config.bits"] == 4:
            return "AWQ.4bit"

    return "None"
75
+
76
+
77
def process_kernels(x):
    """Build the custom-kernel label for one benchmark row.

    GPTQ rows map quantization-config ``version`` 1 / 2 to the ExllamaV1 /
    ExllamaV2 labels; AWQ rows map ``"gemm"`` / ``"gemv"`` to the GEMM / GEMV
    labels. Every other combination yields the string ``"None"``. The
    ``version`` field is only read for GPTQ and AWQ rows.
    """
    scheme = x["config.backend.quantization_scheme"]

    if scheme == "gptq":
        candidates = ((1, "GPTQ.ExllamaV1"), (2, "GPTQ.ExllamaV2"))
    elif scheme == "awq":
        candidates = (("gemm", "AWQ.GEMM"), ("gemv", "AWQ.GEMV"))
    else:
        return "None"

    version = x["config.backend.quantization_config.version"]
    # Compare in declaration order; the first matching version wins.
    for expected, label in candidates:
        if version == expected:
            return label
    return "None"
101