lingyit1108 committed
Commit 23e06a5 · 1 Parent(s): e236b6c

added streamlit files under pages

Files changed (6):
  1. .gitignore +1 -2
  2. main.py +6 -0
  3. pages/1_Leaderboard.py +169 -0
  4. pages/2_Evaluations.py +491 -0
  5. pages/3_app.py +11 -0
  6. streamlit_app.py +5 -2
.gitignore CHANGED
@@ -4,5 +4,4 @@
 results/
 
 *.sqlite
-ux/
-pages/
+ux/

main.py CHANGED
@@ -30,9 +30,13 @@ def main():
 
     ### gpt-4-1106-preview
     ### gpt-3.5-turbo-1106 / gpt-3.5-turbo
+    print("Initializing GPT 3.5 ..")
     llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.1)
+
+    print("Initializing bge-small-en-v1.5 embedding model ..")
     embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
 
+    print("Creating vector store ..")
     service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
     index = VectorStoreIndex.from_documents([document], service_context=service_context)
 
@@ -56,6 +60,8 @@ def main():
 
     tru_recorder = get_prebuilt_trulens_recorder(query_engine,
                                                  app_id="Direct Query Engine")
+
+    print("Sending each question to llm ..")
     with tru_recorder as recording:
         for question in eval_questions:
            response = query_engine.query(question)
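
Note: get_prebuilt_trulens_recorder appears to be a project helper rather than part of trulens_eval's public API. A minimal sketch of what such a helper might wrap, assuming a single answer-relevance feedback (the feedback choice is an illustration, not necessarily this repo's exact setup):

from trulens_eval import Feedback, TruLlama
from trulens_eval.feedback.provider.openai import OpenAI as OpenAIProvider


def get_prebuilt_trulens_recorder(query_engine, app_id):
    provider = OpenAIProvider()
    # Score how relevant each response is to its prompt, with reasoning.
    f_qa_relevance = Feedback(
        provider.relevance_with_cot_reasons, name="Answer Relevance"
    ).on_input_output()
    # TruLlama wraps a LlamaIndex query engine so each .query() call is recorded.
    return TruLlama(query_engine, app_id=app_id, feedbacks=[f_qa_relevance])
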
pages/1_Leaderboard.py ADDED
@@ -0,0 +1,169 @@
+import argparse
+import asyncio
+import json
+import math
+import sys
+
+# https://github.com/jerryjliu/llama_index/issues/7244:
+asyncio.set_event_loop(asyncio.new_event_loop())
+
+from millify import millify
+import numpy as np
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+from trulens_eval.db_migration import MIGRATION_UNKNOWN_STR
+from trulens_eval.ux.styles import CATEGORY
+
+st.runtime.legacy_caching.clear_cache()
+
+from trulens_eval import Tru
+from trulens_eval.ux import styles
+from trulens_eval.ux.components import draw_metadata
+
+st.set_page_config(page_title="Leaderboard", layout="wide")
+
+from trulens_eval.ux.add_logo import add_logo_and_style_overrides
+
+add_logo_and_style_overrides()
+
+database_url = None
+
+
+def streamlit_app():
+    tru = Tru(database_url=database_url)
+    lms = tru.db
+
+    # Set the title and subtitle of the app
+    st.title("App Leaderboard")
+    st.write(
+        "Average feedback values displayed in the range from 0 (worst) to 1 (best)."
+    )
+    df, feedback_col_names = lms.get_records_and_feedback([])
+    feedback_defs = lms.get_feedback_defs()
+    feedback_directions = {
+        (
+            row.feedback_json.get("supplied_name", "") or
+            row.feedback_json["implementation"]["name"]
+        ): row.feedback_json.get("higher_is_better", True)
+        for _, row in feedback_defs.iterrows()
+    }
+
+    if df.empty:
+        st.write("No records yet...")
+        return
+
+    df = df.sort_values(by="app_id")
+
+    if df.empty:
+        st.write("No records yet...")
+
+    apps = list(df.app_id.unique())
+    st.markdown("""---""")
+
+    for app in apps:
+        app_df = df.loc[df.app_id == app]
+        if app_df.empty:
+            continue
+        app_str = app_df["app_json"].iloc[0]
+        app_json = json.loads(app_str)
+        metadata = app_json.get("metadata")
+        # st.text('Metadata' + str(metadata))
+        st.header(app, help=draw_metadata(metadata))
+        app_feedback_col_names = [
+            col_name for col_name in feedback_col_names
+            if not app_df[col_name].isna().all()
+        ]
+        col1, col2, col3, col4, *feedback_cols, col99 = st.columns(
+            5 + len(app_feedback_col_names)
+        )
+        latency_mean = (
+            app_df["latency"].
+            apply(lambda td: td if td != MIGRATION_UNKNOWN_STR else None).mean()
+        )
+
+        # app_df_feedback = df.loc[df.app_id == app]
+
+        col1.metric("Records", len(app_df))
+        col2.metric(
+            "Average Latency (Seconds)",
+            (
+                f"{millify(round(latency_mean, 5), precision=2)}"
+                if not math.isnan(latency_mean) else "nan"
+            ),
+        )
+        col3.metric(
+            "Total Cost (USD)",
+            f"${millify(round(sum(cost for cost in app_df.total_cost if cost is not None), 5), precision = 2)}",
+        )
+        col4.metric(
+            "Total Tokens",
+            millify(
+                sum(
+                    tokens for tokens in app_df.total_tokens
+                    if tokens is not None
+                ),
+                precision=2
+            ),
+        )
+
+        for i, col_name in enumerate(app_feedback_col_names):
+            mean = app_df[col_name].mean()
+
+            st.write(
+                styles.stmetricdelta_hidearrow,
+                unsafe_allow_html=True,
+            )
+
+            higher_is_better = feedback_directions.get(col_name, True)
+
+            if "distance" in col_name:
+                feedback_cols[i].metric(
+                    label=col_name,
+                    value=f"{round(mean, 2)}",
+                    delta_color="normal"
+                )
+            else:
+                cat = CATEGORY.of_score(mean, higher_is_better=higher_is_better)
+                feedback_cols[i].metric(
+                    label=col_name,
+                    value=f"{round(mean, 2)}",
+                    delta=f"{cat.icon} {cat.adjective}",
+                    delta_color=(
+                        "normal" if cat.compare(
+                            mean, CATEGORY.PASS[cat.direction].threshold
+                        ) else "inverse"
+                    ),
+                )
+
+        with col99:
+            if st.button("Select App", key=f"app-selector-{app}"):
+                st.session_state.app = app
+                switch_page("Evaluations")
+
+        # with st.expander("Model metadata"):
+        #     st.markdown(draw_metadata(metadata))
+
+        st.markdown("""---""")
+
+
+# Define the main function to run the app
+def main():
+    streamlit_app()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--database-url", default=None)
+
+    try:
+        args = parser.parse_args()
+    except SystemExit as e:
+        # This exception will be raised if --help or invalid command line arguments
+        # are used. Currently, streamlit prevents the program from exiting normally,
+        # so we have to do a hard exit.
+        sys.exit(e.code)
+
+    database_url = args.database_url
+
+    main()
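
The leaderboard reads everything from the TruLens database via lms.get_records_and_feedback([]). A quick sketch for inspecting the same underlying records outside Streamlit (assumes a populated default database in the working directory):

from trulens_eval import Tru

tru = Tru()  # opens the default sqlite database unless database_url is given
df, feedback_cols = tru.db.get_records_and_feedback([])
print(df[["app_id", "latency", "total_tokens", "total_cost"]].head())
print("Feedback columns:", feedback_cols)
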
pages/2_Evaluations.py ADDED
@@ -0,0 +1,491 @@
+import asyncio
+import json
+from typing import Iterable, Tuple
+
+# https://github.com/jerryjliu/llama_index/issues/7244:
+asyncio.set_event_loop(asyncio.new_event_loop())
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from st_aggrid import AgGrid
+from st_aggrid.grid_options_builder import GridOptionsBuilder
+from st_aggrid.shared import GridUpdateMode
+from st_aggrid.shared import JsCode
+import streamlit as st
+from ux.add_logo import add_logo_and_style_overrides
+from ux.styles import CATEGORY
+
+from trulens_eval import Tru
+from trulens_eval.app import Agent
+from trulens_eval.app import ComponentView
+from trulens_eval.app import instrumented_component_views
+from trulens_eval.app import LLM
+from trulens_eval.app import Other
+from trulens_eval.app import Prompt
+from trulens_eval.app import Tool
+from trulens_eval.db import MULTI_CALL_NAME_DELIMITER
+from trulens_eval.react_components.record_viewer import record_viewer
+from trulens_eval.schema import Record
+from trulens_eval.schema import Select
+from trulens_eval.utils.json import jsonify_for_ui
+from trulens_eval.utils.serial import Lens
+from trulens_eval.ux.components import draw_agent_info
+from trulens_eval.ux.components import draw_call
+from trulens_eval.ux.components import draw_llm_info
+from trulens_eval.ux.components import draw_metadata
+from trulens_eval.ux.components import draw_prompt_info
+from trulens_eval.ux.components import draw_tool_info
+from trulens_eval.ux.components import render_selector_markdown
+from trulens_eval.ux.components import write_or_json
+from trulens_eval.ux.styles import cellstyle_jscode
+
+st.set_page_config(page_title="Evaluations", layout="wide")
+
+st.title("Evaluations")
+
+st.runtime.legacy_caching.clear_cache()
+
+add_logo_and_style_overrides()
+
+tru = Tru()
+lms = tru.db
+
+df_results, feedback_cols = lms.get_records_and_feedback([])
+
+# TODO: remove code redundancy / redundant database calls
+feedback_directions = {
+    (
+        row.feedback_json.get("supplied_name", "") or
+        row.feedback_json["implementation"]["name"]
+    ): (
+        "HIGHER_IS_BETTER" if row.feedback_json.get("higher_is_better", True)
+        else "LOWER_IS_BETTER"
+    ) for _, row in lms.get_feedback_defs().iterrows()
+}
+default_direction = "HIGHER_IS_BETTER"
+
+
+def render_component(query, component, header=True):
+    # Draw the accessor/path within the wrapped app of the component.
+    if header:
+        st.markdown(
+            f"##### Component {render_selector_markdown(Select.for_app(query))}"
+        )
+
+    # Draw the python class information of this component.
+    cls = component.cls
+    base_cls = cls.base_class()
+    label = f"__{repr(cls)}__"
+    if str(base_cls) != str(cls):
+        label += f" < __{repr(base_cls)}__"
+    st.write("Python class: " + label)
+
+    # Per-component-type drawing routines.
+    if isinstance(component, LLM):
+        draw_llm_info(component=component, query=query)
+
+    elif isinstance(component, Prompt):
+        draw_prompt_info(component=component, query=query)
+
+    elif isinstance(component, Agent):
+        draw_agent_info(component=component, query=query)
+
+    elif isinstance(component, Tool):
+        draw_tool_info(component=component, query=query)
+
+    elif isinstance(component, Other):
+        with st.expander("Uncategorized Component Details:"):
+            st.json(jsonify_for_ui(component.json))
+
+    else:
+        with st.expander("Unhandled Component Details:"):
+            st.json(jsonify_for_ui(component.json))
+
+
+# Renders record level metrics (e.g. total tokens, cost, latency) compared to the average when appropriate
+def render_record_metrics(app_df: pd.DataFrame, selected_rows: pd.DataFrame):
+    app_specific_df = app_df[app_df["app_id"] == selected_rows["app_id"][0]]
+
+    token_col, cost_col, latency_col = st.columns(3)
+
+    num_tokens = selected_rows["total_tokens"][0]
+    token_col.metric(label="Total tokens (#)", value=num_tokens)
+
+    cost = selected_rows["total_cost"][0]
+    average_cost = app_specific_df["total_cost"].mean()
+    delta_cost = "{:.3g}".format(cost - average_cost)
+    cost_col.metric(
+        label="Total cost (USD)",
+        value=selected_rows["total_cost"][0],
+        delta=delta_cost,
+        delta_color="inverse",
+    )
+
+    latency = selected_rows["latency"][0]
+    average_latency = app_specific_df["latency"].mean()
+    delta_latency = "{:.3g}s".format(latency - average_latency)
+    latency_col.metric(
+        label="Latency (s)",
+        value=selected_rows["latency"][0],
+        delta=delta_latency,
+        delta_color="inverse",
+    )
+
+
+if df_results.empty:
+    st.write("No records yet...")
+
+else:
+    apps = list(df_results.app_id.unique())
+    if "app" in st.session_state:
+        app = st.session_state.app
+    else:
+        app = apps
+
+    st.experimental_set_query_params(app=app)
+
+    options = st.multiselect("Filter Applications", apps, default=app)
+
+    if len(options) == 0:
+        st.header("All Applications")
+        app_df = df_results
+
+    elif len(options) == 1:
+        st.header(options[0])
+
+        app_df = df_results[df_results.app_id.isin(options)]
+
+    else:
+        st.header("Multiple Applications Selected")
+
+        app_df = df_results[df_results.app_id.isin(options)]
+
+    tab1, tab2 = st.tabs(["Records", "Feedback Functions"])
+
+    with tab1:
+        gridOptions = {"alwaysShowHorizontalScroll": True}
+        evaluations_df = app_df
+
+        # By default the cells in the df are unicode-escaped, so we have to reverse it.
+        input_array = evaluations_df['input'].to_numpy()
+        output_array = evaluations_df['output'].to_numpy()
+
+        decoded_input = np.vectorize(
+            lambda x: x.encode('utf-8').decode('unicode-escape')
+        )(input_array)
+        decoded_output = np.vectorize(
+            lambda x: x.encode('utf-8').decode('unicode-escape')
+        )(output_array)
+
+        evaluations_df['input'] = decoded_input
+        evaluations_df['output'] = decoded_output
+
+        gb = GridOptionsBuilder.from_dataframe(evaluations_df)
+
+        gb.configure_column("type", header_name="App Type")
+        gb.configure_column("record_json", header_name="Record JSON", hide=True)
+        gb.configure_column("app_json", header_name="App JSON", hide=True)
+        gb.configure_column("cost_json", header_name="Cost JSON", hide=True)
+        gb.configure_column("perf_json", header_name="Perf. JSON", hide=True)
+
+        gb.configure_column("record_id", header_name="Record ID", hide=True)
+        gb.configure_column("app_id", header_name="App ID")
+
+        gb.configure_column("feedback_id", header_name="Feedback ID", hide=True)
+        gb.configure_column("input", header_name="User Input")
+        gb.configure_column(
+            "output",
+            header_name="Response",
+        )
+        gb.configure_column("total_tokens", header_name="Total Tokens (#)")
+        gb.configure_column("total_cost", header_name="Total Cost (USD)")
+        gb.configure_column("latency", header_name="Latency (Seconds)")
+        gb.configure_column("tags", header_name="Tags")
+        gb.configure_column("ts", header_name="Time Stamp", sort="desc")
+
+        non_feedback_cols = [
+            "app_id",
+            "type",
+            "ts",
+            "total_tokens",
+            "total_cost",
+            "record_json",
+            "latency",
+            "record_id",
+            "app_id",
+            "cost_json",
+            "app_json",
+            "input",
+            "output",
+            "perf_json",
+        ]
+
+        for feedback_col in evaluations_df.columns.drop(non_feedback_cols):
+            if "distance" in feedback_col:
+                gb.configure_column(
+                    feedback_col, hide=feedback_col.endswith("_calls")
+                )
+            else:
+                # cell highlight depending on feedback direction
+                cellstyle = JsCode(
+                    cellstyle_jscode[feedback_directions.get(
+                        feedback_col, default_direction
+                    )]
+                )
+
+                gb.configure_column(
+                    feedback_col,
+                    cellStyle=cellstyle,
+                    hide=feedback_col.endswith("_calls")
+                )
+
+        gb.configure_pagination()
+        gb.configure_side_bar()
+        gb.configure_selection(selection_mode="single", use_checkbox=False)
+        # gb.configure_default_column(groupable=True, value=True, enableRowGroup=True, aggFunc="sum", editable=True)
+        gridOptions = gb.build()
+        data = AgGrid(
+            evaluations_df,
+            gridOptions=gridOptions,
+            update_mode=GridUpdateMode.SELECTION_CHANGED,
+            allow_unsafe_jscode=True,
+        )
+
+        selected_rows = data["selected_rows"]
+        selected_rows = pd.DataFrame(selected_rows)
+
+        if len(selected_rows) == 0:
+            st.write("Hint: select a row to display details of a record")
+
+        else:
+            # Start the record specific section
+            st.divider()
+
+            # Breadcrumbs
+            st.caption(
+                f"{selected_rows['app_id'][0]} / {selected_rows['record_id'][0]}"
+            )
+            st.header(f"{selected_rows['record_id'][0]}")
+
+            render_record_metrics(app_df, selected_rows)
+
+            st.markdown("")
+
+            prompt = selected_rows["input"][0]
+            response = selected_rows["output"][0]
+            details = selected_rows["app_json"][0]
+
+            app_json = json.loads(
+                details
+            )  # apps may not be deserializable, don't try to, keep it json.
+
+            row = selected_rows.head().iloc[0]
+
+            # Display input/response side by side. In each column, we put them in tabs mainly for
+            # formatting/styling purposes.
+            input_col, response_col = st.columns(2)
+
+            (input_tab,) = input_col.tabs(["Input"])
+            with input_tab:
+                with st.expander(
+                        f"Input {render_selector_markdown(Select.RecordInput)}",
+                        expanded=True):
+                    write_or_json(st, obj=prompt)
+
+            (response_tab,) = response_col.tabs(["Response"])
+            with response_tab:
+                with st.expander(
+                        f"Response {render_selector_markdown(Select.RecordOutput)}",
+                        expanded=True):
+                    write_or_json(st, obj=response)
+
+            feedback_tab, metadata_tab = st.tabs(["Feedback", "Metadata"])
+
+            with metadata_tab:
+                metadata = app_json.get("metadata")
+                if metadata:
+                    with st.expander("Metadata"):
+                        st.markdown(draw_metadata(metadata))
+                else:
+                    st.write("No metadata found")
+
+            with feedback_tab:
+                if len(feedback_cols) == 0:
+                    st.write("No feedback details")
+
+                for fcol in feedback_cols:
+                    feedback_name = fcol
+                    feedback_result = row[fcol]
+                    print(feedback_result)
+
+                    if MULTI_CALL_NAME_DELIMITER in fcol:
+                        fcol = fcol.split(MULTI_CALL_NAME_DELIMITER)[0]
+                    feedback_calls = row[f"{fcol}_calls"]
+
+                    def display_feedback_call(call):
+
+                        def highlight(s):
+                            if "distance" in feedback_name:
+                                return [
+                                    f"background-color: {CATEGORY.UNKNOWN.color}"
+                                ] * len(s)
+                            cat = CATEGORY.of_score(
+                                s.result,
+                                higher_is_better=feedback_directions.get(
+                                    fcol, default_direction
+                                ) == default_direction
+                            )
+                            return [f"background-color: {cat.color}"] * len(s)
+
+                        if call is not None and len(call) > 0:
+                            df = pd.DataFrame.from_records(
+                                [call[i]["args"] for i in range(len(call))]
+                            )
+                            df["result"] = pd.DataFrame(
+                                [
+                                    float(call[i]["ret"])
+                                    if call[i]["ret"] is not None else -1
+                                    for i in range(len(call))
+                                ]
+                            )
+                            df["meta"] = pd.Series(
+                                [call[i]["meta"] for i in range(len(call))]
+                            )
+                            df = df.join(df.meta.apply(lambda m: pd.Series(m))
+                                        ).drop(columns="meta")
+
+                            st.dataframe(
+                                df.style.apply(highlight, axis=1).format(
+                                    "{:.2}", subset=["result"]
+                                )
+                            )
+
+                        else:
+                            st.text("No feedback details.")
+
+                    with st.expander(f"{feedback_name} = {feedback_result}",
+                                     expanded=True):
+                        display_feedback_call(feedback_calls)
+
+            record_str = selected_rows["record_json"][0]
+            record_json = json.loads(record_str)
+            record = Record.model_validate(record_json)
+
+            classes: Iterable[Tuple[Lens, ComponentView]
+                             ] = list(instrumented_component_views(app_json))
+            classes_map = {path: view for path, view in classes}
+
+            st.markdown("")
+            st.subheader("Timeline")
+            val = record_viewer(record_json, app_json)
+            st.markdown("")
+
+            match_query = None
+
+            # Assumes record_json['perf']['start_time'] is always present
+            if val != "":
+                match = None
+                for call in record.calls:
+                    if call.perf.start_time.isoformat() == val:
+                        match = call
+                        break
+
+                if match:
+                    length = len(match.stack)
+                    app_call = match.stack[length - 1]
+
+                    match_query = match.top().path
+
+                    st.subheader(
+                        f"{app_call.method.obj.cls.name} {render_selector_markdown(Select.for_app(match_query))}"
+                    )
+
+                    draw_call(match)
+
+                    view = classes_map.get(match_query)
+                    if view is not None:
+                        render_component(
+                            query=match_query, component=view, header=False
+                        )
+                    else:
+                        st.write(
+                            f"Call by `{match_query}` was not associated with any instrumented"
+                            " component."
+                        )
+                        # Look up whether there was any data at that path even if not an instrumented component:
+
+                        try:
+                            app_component_json = list(
+                                match_query.get(app_json)
+                            )[0]
+                            if app_component_json is not None:
+                                with st.expander(
+                                        "Uninstrumented app component details."
+                                ):
+                                    st.json(app_component_json)
+                        except Exception:
+                            st.write(
+                                f"Recorded invocation by component `{match_query}` but cannot find this component in the app json."
+                            )
+
+                else:
+                    st.text("No match found")
+            else:
+                st.subheader(f"App {render_selector_markdown(Select.App)}")
+                with st.expander("App Details:"):
+                    st.json(jsonify_for_ui(app_json))
+
+            if match_query is not None:
+                container = st.empty()
+
+                has_subcomponents = False
+                for query, component in classes:
+                    if not match_query.is_immediate_prefix_of(query):
+                        continue
+
+                    if len(query.path) == 0:
+                        # Skip App, will still list App.app under "app".
+                        continue
+
+                    has_subcomponents = True
+                    render_component(query, component)
+
+                if has_subcomponents:
+                    container.markdown("#### Subcomponents:")
+
+            st.header("More options:")
+
+            if st.button("Display full app json"):
+                st.write(jsonify_for_ui(app_json))
+
+            if st.button("Display full record json"):
+                st.write(jsonify_for_ui(record_json))
+
+    with tab2:
+        feedback = feedback_cols
+        cols = 4
+        rows = len(feedback) // cols + 1
+
+        for row_num in range(rows):
+            with st.container():
+                columns = st.columns(cols)
+                for col_num in range(cols):
+                    with columns[col_num]:
+                        ind = row_num * cols + col_num
+                        if ind < len(feedback):
+                            # Generate histogram
+                            fig, ax = plt.subplots()
+                            bins = [
+                                0, 0.2, 0.4, 0.6, 0.8, 1.0
+                            ]  # Quintile buckets
+                            ax.hist(
+                                app_df[feedback[ind]],
+                                bins=bins,
+                                edgecolor="black",
+                                color="#2D736D"
+                            )
+                            ax.set_xlabel("Feedback Value")
+                            ax.set_ylabel("Frequency")
+                            ax.set_title(feedback[ind], loc="center")
+                            st.pyplot(fig)
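
One detail worth noting: cell text arrives unicode-escaped, so the Records tab reverses it with encode('utf-8').decode('unicode-escape'). A standalone illustration of that round-trip:

# The stored string contains literal backslash escapes, not the characters.
s = "What is a \\u201cquery engine\\u201d?"
print(s.encode("utf-8").decode("unicode-escape"))
# -> What is a “query engine”?
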
pages/3_app.py ADDED
@@ -0,0 +1,11 @@
+import streamlit as st
+import os
+
+try:
+    raw_docs_files = ", ".join(os.listdir("./raw_documents"))
+    curr_directory_files = ", ".join(os.listdir("."))
+    file_ls_str = raw_docs_files + "\n\n" + curr_directory_files
+except:
+    file_ls_str = "NA"
+
+st.write(f"Hello World! File list: {file_ls_str}")
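
Streamlit's multipage convention applies to these files: each script under pages/ becomes a sidebar entry, the numeric prefix sets the order, and the rest of the filename becomes the label, so this page shows up as "app". The resulting layout from this commit:

pages/
  1_Leaderboard.py   # sidebar entry "Leaderboard"
  2_Evaluations.py   # sidebar entry "Evaluations"
  3_app.py           # sidebar entry "app"
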
streamlit_app.py CHANGED
@@ -22,8 +22,11 @@ evaluation_path = pkg_resources.resource_filename(
 ux_path = pkg_resources.resource_filename(
     "trulens_eval", "ux"
 )
-shutil.copyfile(leaderboard_path, os.path.join("pages", "1_Leaderboard.py"))
-shutil.copyfile(evaluation_path, os.path.join("pages", "2_Evaluations.py"))
+
+os.makedirs("./pages", exist_ok=True)
+shutil.copyfile(leaderboard_path, os.path.join("./pages", "1_Leaderboard.py"))
+shutil.copyfile(evaluation_path, os.path.join("./pages", "2_Evaluations.py"))
+
 if os.path.exists("./ux"):
     shutil.rmtree("./ux")
     shutil.copytree(ux_path, "./ux")
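
pkg_resources.resource_filename resolves files shipped inside the installed trulens_eval package, which is where leaderboard_path and evaluation_path come from in the context above. A minimal sketch of that lookup (the exact resource name is an assumption based on the variable names):

import pkg_resources

# Absolute path of a file bundled with an installed package:
leaderboard_path = pkg_resources.resource_filename("trulens_eval", "Leaderboard.py")
print(leaderboard_path)  # e.g. .../site-packages/trulens_eval/Leaderboard.py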