cyberosa committed on
Commit
ac62b55
·
1 Parent(s): af4f5ae

Prints and formatting

Browse files
Files changed (3) hide show
  1. app.py +89 -39
  2. tabs/dashboard.py +3 -1
  3. tabs/run_benchmark.py +15 -5
app.py CHANGED
@@ -8,7 +8,7 @@ from tabs.faq import (
8
  about_olas_predict_benchmark,
9
  about_olas_predict,
10
  about_the_dataset,
11
- about_the_tools
12
  )
13
  from tabs.howto_benchmark import how_to_run
14
  from tabs.run_benchmark import run_benchmark_main
@@ -17,17 +17,36 @@ from tabs.run_benchmark import run_benchmark_main
17
  demo = gr.Blocks()
18
 
19
 
20
- def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key):
 
 
 
 
 
 
 
21
  """Run the benchmark using inputs."""
22
  if tool_name is None:
23
  return "Please enter the name of your tool."
24
- if openai_api_key is None and anthropic_api_key is None and openrouter_api_key is None:
 
 
 
 
25
  return "Please enter either OpenAI or Anthropic or OpenRouter API key."
26
-
27
- result = run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key)
28
- if result == 'completed':
 
 
 
 
 
 
 
 
29
  # get the results file in the results directory
30
- fns = glob('results/*.csv')
31
 
32
  print(f"Number of files in results directory: {len(fns)}")
33
 
@@ -35,10 +54,10 @@ def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, a
35
  files = [Path(file) for file in fns]
36
 
37
  # get results and summary files
38
- results_files = [file for file in files if 'results' in file.name]
39
 
40
  # the other file is the summary file
41
- summary_files = [file for file in files if 'summary' in file.name]
42
 
43
  print(results_files, summary_files)
44
 
@@ -51,13 +70,17 @@ def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, a
51
  summary_df = summary_df.round(4)
52
 
53
  return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
54
-
55
- return gr.Textbox(label="Benchmark Result", value=result, interactive=False), gr.Textbox(label="Summary", value="")
 
 
56
 
57
 
58
  with demo:
59
  gr.HTML("<h1>Olas Predict Benchmark</hjson>")
60
- gr.Markdown("Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project.")
 
 
61
 
62
  with gr.Tabs() as tabs:
63
  # first tab - leaderboard
@@ -82,7 +105,6 @@ with demo:
82
  with gr.Accordion("About Olas", open=False):
83
  gr.Markdown(about_olas_predict)
84
 
85
-
86
  # third tab - how to run the benchmark
87
  with gr.TabItem("🚀 Contribute"):
88
  gr.Markdown(how_to_run)
@@ -97,34 +119,53 @@ with demo:
97
  # "prediction-online-summarized-info",
98
  # "prediction-offline-sme",
99
  # "prediction-online-sme",
100
- 'prediction-request-rag',
101
- 'prediction-request-reasoning',
102
  # "prediction-url-cot-claude",
103
  # "prediction-request-rag-cohere",
104
  # "prediction-with-research-conservative",
105
  # "prediction-with-research-bold",
106
- ], label="Tool Name", info="Choose the tool to run")
107
- model_name = gr.Dropdown([
108
- "gpt-3.5-turbo-0125",
109
- "gpt-4-0125-preview",
110
- "claude-3-haiku-20240307",
111
- "claude-3-sonnet-20240229",
112
- "claude-3-opus-20240229",
113
- "databricks/dbrx-instruct:nitro",
114
- "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
115
- # "cohere/command-r-plus",
116
- ], label="Model Name", info="Choose the model to use")
 
 
 
 
 
 
 
117
  with gr.Row():
118
- openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
119
- anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
120
- openrouter_api_key = gr.Textbox(label="OpenRouter API Key", placeholder="Enter your OpenRouter API key here", type="password")
 
 
 
 
 
 
 
 
 
 
 
 
121
  with gr.Row():
122
  num_questions = gr.Slider(
123
- minimum=1,
124
- maximum=340,
125
- value=10,
126
- label="Number of questions to run the benchmark on",
127
- )
128
  with gr.Row():
129
  run_button = gr.Button("Run Benchmark")
130
  with gr.Row():
@@ -133,10 +174,19 @@ with demo:
133
  with gr.Row():
134
  with gr.Accordion("Summary", open=False):
135
  summary = gr.Dataframe()
136
-
137
- run_button.click(run_benchmark_gradio,
138
- inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key],
139
- outputs=[result, summary])
 
 
 
 
 
 
 
 
 
140
 
141
 
142
- demo.queue(default_concurrency_limit=40).launch()
 
8
  about_olas_predict_benchmark,
9
  about_olas_predict,
10
  about_the_dataset,
11
+ about_the_tools,
12
  )
13
  from tabs.howto_benchmark import how_to_run
14
  from tabs.run_benchmark import run_benchmark_main
 
17
  demo = gr.Blocks()
18
 
19
 
20
+ def run_benchmark_gradio(
21
+ tool_name,
22
+ model_name,
23
+ num_questions,
24
+ openai_api_key,
25
+ anthropic_api_key,
26
+ openrouter_api_key,
27
+ ):
28
  """Run the benchmark using inputs."""
29
  if tool_name is None:
30
  return "Please enter the name of your tool."
31
+ if (
32
+ openai_api_key is None
33
+ and anthropic_api_key is None
34
+ and openrouter_api_key is None
35
+ ):
36
  return "Please enter either OpenAI or Anthropic or OpenRouter API key."
37
+
38
+ result = run_benchmark_main(
39
+ tool_name,
40
+ model_name,
41
+ num_questions,
42
+ openai_api_key,
43
+ anthropic_api_key,
44
+ openrouter_api_key,
45
+ )
46
+
47
+ if result == "completed":
48
  # get the results file in the results directory
49
+ fns = glob("results/*.csv")
50
 
51
  print(f"Number of files in results directory: {len(fns)}")
52
 
 
54
  files = [Path(file) for file in fns]
55
 
56
  # get results and summary files
57
+ results_files = [file for file in files if "results" in file.name]
58
 
59
  # the other file is the summary file
60
+ summary_files = [file for file in files if "summary" in file.name]
61
 
62
  print(results_files, summary_files)
63
 
 
70
  summary_df = summary_df.round(4)
71
 
72
  return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
73
+
74
+ return gr.Textbox(
75
+ label="Benchmark Result", value=result, interactive=False
76
+ ), gr.Textbox(label="Summary", value="")
77
 
78
 
79
  with demo:
80
  gr.HTML("<h1>Olas Predict Benchmark</hjson>")
81
+ gr.Markdown(
82
+ "Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project."
83
+ )
84
 
85
  with gr.Tabs() as tabs:
86
  # first tab - leaderboard
 
105
  with gr.Accordion("About Olas", open=False):
106
  gr.Markdown(about_olas_predict)
107
 
 
108
  # third tab - how to run the benchmark
109
  with gr.TabItem("🚀 Contribute"):
110
  gr.Markdown(how_to_run)
 
119
  # "prediction-online-summarized-info",
120
  # "prediction-offline-sme",
121
  # "prediction-online-sme",
122
+ "prediction-request-rag",
123
+ "prediction-request-reasoning",
124
  # "prediction-url-cot-claude",
125
  # "prediction-request-rag-cohere",
126
  # "prediction-with-research-conservative",
127
  # "prediction-with-research-bold",
128
+ ],
129
+ label="Tool Name",
130
+ info="Choose the tool to run",
131
+ )
132
+ model_name = gr.Dropdown(
133
+ [
134
+ "gpt-3.5-turbo-0125",
135
+ "gpt-4-0125-preview",
136
+ "claude-3-haiku-20240307",
137
+ "claude-3-sonnet-20240229",
138
+ "claude-3-opus-20240229",
139
+ "databricks/dbrx-instruct:nitro",
140
+ "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
141
+ # "cohere/command-r-plus",
142
+ ],
143
+ label="Model Name",
144
+ info="Choose the model to use",
145
+ )
146
  with gr.Row():
147
+ openai_api_key = gr.Textbox(
148
+ label="OpenAI API Key",
149
+ placeholder="Enter your OpenAI API key here",
150
+ type="password",
151
+ )
152
+ anthropic_api_key = gr.Textbox(
153
+ label="Anthropic API Key",
154
+ placeholder="Enter your Anthropic API key here",
155
+ type="password",
156
+ )
157
+ openrouter_api_key = gr.Textbox(
158
+ label="OpenRouter API Key",
159
+ placeholder="Enter your OpenRouter API key here",
160
+ type="password",
161
+ )
162
  with gr.Row():
163
  num_questions = gr.Slider(
164
+ minimum=1,
165
+ maximum=340,
166
+ value=10,
167
+ label="Number of questions to run the benchmark on",
168
+ )
169
  with gr.Row():
170
  run_button = gr.Button("Run Benchmark")
171
  with gr.Row():
 
174
  with gr.Row():
175
  with gr.Accordion("Summary", open=False):
176
  summary = gr.Dataframe()
177
+
178
+ run_button.click(
179
+ run_benchmark_gradio,
180
+ inputs=[
181
+ tool_name,
182
+ model_name,
183
+ num_questions,
184
+ openai_api_key,
185
+ anthropic_api_key,
186
+ openrouter_api_key,
187
+ ],
188
+ outputs=[result, summary],
189
+ )
190
 
191
 
192
+ demo.queue(default_concurrency_limit=40).launch()
tabs/dashboard.py CHANGED
@@ -3,8 +3,10 @@ import pandas as pd
3
 
4
  csv_file_path = "formatted_data.csv"
5
 
 
6
  def return_df():
7
  # Reading the CSV file
 
8
  df = pd.read_csv(csv_file_path)
9
 
10
  # all floats to be rounded to 2 decimal places
@@ -12,4 +14,4 @@ def return_df():
12
  return df
13
 
14
 
15
- df = return_df()
 
3
 
4
  csv_file_path = "formatted_data.csv"
5
 
6
+
7
  def return_df():
8
  # Reading the CSV file
9
+ print("Reading csv file with results")
10
  df = pd.read_csv(csv_file_path)
11
 
12
  # all floats to be rounded to 2 decimal places
 
14
  return df
15
 
16
 
17
+ df = return_df()
tabs/run_benchmark.py CHANGED
@@ -2,8 +2,17 @@ import os
2
  from benchmark.run_benchmark import run_benchmark
3
 
4
 
5
- def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key):
 
 
 
 
 
 
 
6
  """Run the benchmark using the provided function and API key."""
 
 
7
  # Empty the results directory
8
  os.system("rm -rf results/*")
9
 
@@ -30,7 +39,10 @@ def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, ant
30
  else:
31
  kwargs["llm_provider"] = "openrouter"
32
 
33
- if tool_name == "prediction-request-reasoning" or tool_name == "prediction-request-rag":
 
 
 
34
  if not openai_api_key:
35
  return f"Error: Tools that use RAG also require an OpenAI API Key"
36
 
@@ -39,12 +51,10 @@ def run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, ant
39
  kwargs["provide_source_links"] = True
40
 
41
  print(f"Running benchmark")
42
-
43
  # Run the benchmark
44
  try:
45
  run_benchmark(kwargs=kwargs)
46
  return "completed"
47
  except Exception as e:
48
  return f"Error running benchmark: {e}"
49
-
50
-
 
2
  from benchmark.run_benchmark import run_benchmark
3
 
4
 
5
+ def run_benchmark_main(
6
+ tool_name,
7
+ model_name,
8
+ num_questions,
9
+ openai_api_key,
10
+ anthropic_api_key,
11
+ openrouter_api_key,
12
+ ):
13
  """Run the benchmark using the provided function and API key."""
14
+
15
+ print("Running benchmark for the provided api keys")
16
  # Empty the results directory
17
  os.system("rm -rf results/*")
18
 
 
39
  else:
40
  kwargs["llm_provider"] = "openrouter"
41
 
42
+ if (
43
+ tool_name == "prediction-request-reasoning"
44
+ or tool_name == "prediction-request-rag"
45
+ ):
46
  if not openai_api_key:
47
  return f"Error: Tools that use RAG also require an OpenAI API Key"
48
 
 
51
  kwargs["provide_source_links"] = True
52
 
53
  print(f"Running benchmark")
54
+
55
  # Run the benchmark
56
  try:
57
  run_benchmark(kwargs=kwargs)
58
  return "completed"
59
  except Exception as e:
60
  return f"Error running benchmark: {e}"