Spaces:
Running
Running
lingyit1108
committed on
Commit
·
23e06a5
1
Parent(s):
e236b6c
added streamlit files under pages
Browse files- .gitignore +1 -2
- main.py +6 -0
- pages/1_Leaderboard.py +169 -0
- pages/2_Evaluations.py +491 -0
- pages/3_app.py +11 -0
- streamlit_app.py +5 -2
.gitignore
CHANGED
@@ -4,5 +4,4 @@
|
|
4 |
results/
|
5 |
|
6 |
*.sqlite
|
7 |
-
ux/
|
8 |
-
pages/
|
|
|
4 |
results/
|
5 |
|
6 |
*.sqlite
|
7 |
+
ux/
|
|
main.py
CHANGED
@@ -30,9 +30,13 @@ def main():
|
|
30 |
|
31 |
### gpt-4-1106-preview
|
32 |
### gpt-3.5-turbo-1106 / gpt-3.5-turbo
|
|
|
33 |
llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.1)
|
|
|
|
|
34 |
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
|
35 |
|
|
|
36 |
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
|
37 |
index = VectorStoreIndex.from_documents([document], service_context=service_context)
|
38 |
|
@@ -56,6 +60,8 @@ def main():
|
|
56 |
|
57 |
tru_recorder = get_prebuilt_trulens_recorder(query_engine,
|
58 |
app_id="Direct Query Engine")
|
|
|
|
|
59 |
with tru_recorder as recording:
|
60 |
for question in eval_questions:
|
61 |
response = query_engine.query(question)
|
|
|
30 |
|
31 |
### gpt-4-1106-preview
|
32 |
### gpt-3.5-turbo-1106 / gpt-3.5-turbo
|
33 |
+
print("Initializing GPT 3.5 ..")
|
34 |
llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.1)
|
35 |
+
|
36 |
+
print("Initializing bge-small-en-v1.5 embedding model ..")
|
37 |
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
|
38 |
|
39 |
+
print("Creating vector store ..")
|
40 |
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
|
41 |
index = VectorStoreIndex.from_documents([document], service_context=service_context)
|
42 |
|
|
|
60 |
|
61 |
tru_recorder = get_prebuilt_trulens_recorder(query_engine,
|
62 |
app_id="Direct Query Engine")
|
63 |
+
|
64 |
+
print("Sending each question to llm ..")
|
65 |
with tru_recorder as recording:
|
66 |
for question in eval_questions:
|
67 |
response = query_engine.query(question)
|
pages/1_Leaderboard.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import asyncio
|
3 |
+
import json
|
4 |
+
import math
|
5 |
+
import sys
|
6 |
+
|
7 |
+
# https://github.com/jerryjliu/llama_index/issues/7244:
|
8 |
+
asyncio.set_event_loop(asyncio.new_event_loop())
|
9 |
+
|
10 |
+
from millify import millify
|
11 |
+
import numpy as np
|
12 |
+
import streamlit as st
|
13 |
+
from streamlit_extras.switch_page_button import switch_page
|
14 |
+
|
15 |
+
from trulens_eval.db_migration import MIGRATION_UNKNOWN_STR
|
16 |
+
from trulens_eval.ux.styles import CATEGORY
|
17 |
+
|
18 |
+
st.runtime.legacy_caching.clear_cache()
|
19 |
+
|
20 |
+
from trulens_eval import Tru
|
21 |
+
from trulens_eval.ux import styles
|
22 |
+
from trulens_eval.ux.components import draw_metadata
|
23 |
+
|
24 |
+
st.set_page_config(page_title="Leaderboard", layout="wide")
|
25 |
+
|
26 |
+
from trulens_eval.ux.add_logo import add_logo_and_style_overrides
|
27 |
+
|
28 |
+
add_logo_and_style_overrides()
|
29 |
+
|
30 |
+
database_url = None
|
31 |
+
|
32 |
+
|
33 |
+
def streamlit_app():
    """Render the "App Leaderboard" page.

    Loads all records and feedback definitions from the ``Tru`` database
    selected by the module-level ``database_url``, then draws one section
    per app: record count, mean latency, total cost, total tokens, the mean
    of every feedback column that has data for that app, and a button that
    stores the app in the session state and switches to the "Evaluations"
    page.
    """
    tru = Tru(database_url=database_url)
    lms = tru.db

    # Set the title and subtitle of the app
    st.title("App Leaderboard")
    st.write(
        "Average feedback values displayed in the range from 0 (worst) to 1 (best)."
    )
    df, feedback_col_names = lms.get_records_and_feedback([])
    feedback_defs = lms.get_feedback_defs()
    # Map feedback name -> whether a higher score is better; the supplied
    # name wins over the implementation name when it is non-empty.
    feedback_directions = {
        (
            row.feedback_json.get("supplied_name", "") or
            row.feedback_json["implementation"]["name"]
        ): row.feedback_json.get("higher_is_better", True)
        for _, row in feedback_defs.iterrows()
    }

    if df.empty:
        st.write("No records yet...")
        return

    df = df.sort_values(by="app_id")

    apps = list(df.app_id.unique())
    st.markdown("""---""")

    for app in apps:
        app_df = df.loc[df.app_id == app]
        if app_df.empty:
            continue
        app_str = app_df["app_json"].iloc[0]
        app_json = json.loads(app_str)
        metadata = app_json.get("metadata")
        st.header(app, help=draw_metadata(metadata))

        # Only show feedback columns that have at least one value for this app.
        app_feedback_col_names = [
            col_name for col_name in feedback_col_names
            if not app_df[col_name].isna().all()
        ]
        # Four fixed metric columns, one column per feedback, and a trailing
        # column for the "Select App" button.
        col1, col2, col3, col4, *feedback_cols, col99 = st.columns(
            5 + len(app_feedback_col_names)
        )
        # Latencies recorded as the migration placeholder are excluded from
        # the mean by mapping them to None.
        latency_mean = (
            app_df["latency"].
            apply(lambda td: td if td != MIGRATION_UNKNOWN_STR else None).mean()
        )

        col1.metric("Records", len(app_df))
        col2.metric(
            "Average Latency (Seconds)",
            (
                f"{millify(round(latency_mean, 5), precision=2)}"
                if not math.isnan(latency_mean) else "nan"
            ),
        )
        # NOTE(review): `is not None` does not filter NaN values; confirm the
        # cost/token columns never contain NaN.
        col3.metric(
            "Total Cost (USD)",
            f"${millify(round(sum(cost for cost in app_df.total_cost if cost is not None), 5), precision = 2)}",
        )
        col4.metric(
            "Total Tokens",
            millify(
                sum(
                    tokens for tokens in app_df.total_tokens
                    if tokens is not None
                ),
                precision=2
            ),
        )

        for feedback_col, col_name in zip(feedback_cols, app_feedback_col_names):
            mean = app_df[col_name].mean()

            st.write(
                styles.stmetricdelta_hidearrow,
                unsafe_allow_html=True,
            )

            higher_is_better = feedback_directions.get(col_name, True)

            if "distance" in col_name:
                # Distance-style feedback is shown without a category delta.
                feedback_col.metric(
                    label=col_name,
                    value=f"{round(mean, 2)}",
                    delta_color="normal"
                )
            else:
                cat = CATEGORY.of_score(mean, higher_is_better=higher_is_better)
                feedback_col.metric(
                    label=col_name,
                    value=f"{round(mean, 2)}",
                    delta=f"{cat.icon} {cat.adjective}",
                    delta_color=(
                        "normal" if cat.compare(
                            mean, CATEGORY.PASS[cat.direction].threshold
                        ) else "inverse"
                    ),
                )

        with col99:
            if st.button("Select App", key=f"app-selector-{app}"):
                st.session_state.app = app
                switch_page("Evaluations")

        st.markdown("""---""")
|
148 |
+
|
149 |
+
|
150 |
+
# Define the main function to run the app
|
151 |
+
def main():
    """Run the leaderboard page by delegating to ``streamlit_app``."""
    streamlit_app()
|
153 |
+
|
154 |
+
|
155 |
+
if __name__ == "__main__":
    # Parse the optional --database-url flag, publish it through the
    # module-level `database_url` that streamlit_app() reads, then render.
    cli = argparse.ArgumentParser()
    cli.add_argument("--database-url", default=None)

    try:
        parsed = cli.parse_args()
    except SystemExit as exit_request:
        # argparse raises SystemExit for --help and for invalid command line
        # arguments. Currently, streamlit prevents the program from exiting
        # normally, so we have to do a hard exit.
        sys.exit(exit_request.code)

    database_url = parsed.database_url

    main()
|
pages/2_Evaluations.py
ADDED
@@ -0,0 +1,491 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import json
|
3 |
+
from typing import Iterable, Tuple
|
4 |
+
|
5 |
+
# https://github.com/jerryjliu/llama_index/issues/7244:
|
6 |
+
asyncio.set_event_loop(asyncio.new_event_loop())
|
7 |
+
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
import numpy as np
|
10 |
+
import pandas as pd
|
11 |
+
from st_aggrid import AgGrid
|
12 |
+
from st_aggrid.grid_options_builder import GridOptionsBuilder
|
13 |
+
from st_aggrid.shared import GridUpdateMode
|
14 |
+
from st_aggrid.shared import JsCode
|
15 |
+
import streamlit as st
|
16 |
+
from ux.add_logo import add_logo_and_style_overrides
|
17 |
+
from ux.styles import CATEGORY
|
18 |
+
|
19 |
+
from trulens_eval import Tru
|
20 |
+
from trulens_eval.app import Agent
|
21 |
+
from trulens_eval.app import ComponentView
|
22 |
+
from trulens_eval.app import instrumented_component_views
|
23 |
+
from trulens_eval.app import LLM
|
24 |
+
from trulens_eval.app import Other
|
25 |
+
from trulens_eval.app import Prompt
|
26 |
+
from trulens_eval.app import Tool
|
27 |
+
from trulens_eval.db import MULTI_CALL_NAME_DELIMITER
|
28 |
+
from trulens_eval.react_components.record_viewer import record_viewer
|
29 |
+
from trulens_eval.schema import Record
|
30 |
+
from trulens_eval.schema import Select
|
31 |
+
from trulens_eval.utils.json import jsonify_for_ui
|
32 |
+
from trulens_eval.utils.serial import Lens
|
33 |
+
from trulens_eval.ux.components import draw_agent_info
|
34 |
+
from trulens_eval.ux.components import draw_call
|
35 |
+
from trulens_eval.ux.components import draw_llm_info
|
36 |
+
from trulens_eval.ux.components import draw_metadata
|
37 |
+
from trulens_eval.ux.components import draw_prompt_info
|
38 |
+
from trulens_eval.ux.components import draw_tool_info
|
39 |
+
from trulens_eval.ux.components import render_selector_markdown
|
40 |
+
from trulens_eval.ux.components import write_or_json
|
41 |
+
from trulens_eval.ux.styles import cellstyle_jscode
|
42 |
+
|
43 |
+
# Page chrome first: Streamlit page config, title, cache reset, logo/styles.
st.set_page_config(page_title="Evaluations", layout="wide")

st.title("Evaluations")

st.runtime.legacy_caching.clear_cache()

add_logo_and_style_overrides()

# Database handle and the full records/feedback table used by the page.
tru = Tru()
lms = tru.db

df_results, feedback_cols = lms.get_records_and_feedback([])


def _direction_entry(feedback_json):
    """Return ``(feedback name, direction key)`` for one feedback definition."""
    name = (
        feedback_json.get("supplied_name", "") or
        feedback_json["implementation"]["name"]
    )
    if feedback_json.get("higher_is_better", True):
        return name, "HIGHER_IS_BETTER"
    return name, "LOWER_IS_BETTER"


# TODO: remove code redundancy / redundant database calls
feedback_directions = dict(
    _direction_entry(row.feedback_json)
    for _, row in lms.get_feedback_defs().iterrows()
)
default_direction = "HIGHER_IS_BETTER"
|
67 |
+
|
68 |
+
|
69 |
+
def render_component(query, component, header=True):
    """Draw one app component into the current Streamlit container.

    Args:
        query: Path of the component within the wrapped app; passed to
            ``Select.for_app`` for the header and to the per-type drawers.
        component: Component view to render; its ``cls`` and ``json``
            attributes are read here.
        header: When true, a "Component <selector>" heading is written first.
    """
    # Accessor/path of the component within the wrapped app.
    if header:
        selector_md = render_selector_markdown(Select.for_app(query))
        st.markdown(f"##### Component {selector_md}")

    # Python class information, with the base class appended when it differs.
    component_cls = component.cls
    parent_cls = component_cls.base_class()
    class_label = f"__{repr(component_cls)}__"
    if str(parent_cls) != str(component_cls):
        class_label = f"{class_label} < __{repr(parent_cls)}__"
    st.write("Python class: " + class_label)

    # Per-component-type drawing routines, checked in this order.
    typed_drawers = (
        (LLM, draw_llm_info),
        (Prompt, draw_prompt_info),
        (Agent, draw_agent_info),
        (Tool, draw_tool_info),
    )
    for component_type, draw in typed_drawers:
        if isinstance(component, component_type):
            draw(component=component, query=query)
            return

    # Anything else is dumped as JSON under a title that says whether it is
    # a known "Other" component or an unhandled kind.
    if isinstance(component, Other):
        expander_title = "Uncategorized Component Details:"
    else:
        expander_title = "Unhandled Component Details:"
    with st.expander(expander_title):
        st.json(jsonify_for_ui(component.json))
|
104 |
+
|
105 |
+
|
106 |
+
# Renders record level metrics (e.g. total tokens, cost, latency) compared to the average when appropriate
|
107 |
+
def render_record_metrics(app_df: pd.DataFrame, selected_rows: pd.DataFrame):
    """Show token, cost and latency metrics for the selected record.

    Cost and latency are shown with a delta against the mean of all records
    in ``app_df`` that share the selected record's ``app_id``.

    Args:
        app_df: Records currently shown on the page; the ``app_id``,
            ``total_cost`` and ``latency`` columns are read.
        selected_rows: Frame whose first row is the selected record; the
            ``app_id``, ``total_tokens``, ``total_cost`` and ``latency``
            columns are read.
    """
    # Positional access: the selection frame is not guaranteed to carry a
    # 0-based index, so a label lookup of `0` could raise KeyError.
    selected_app_id = selected_rows["app_id"].iloc[0]
    app_specific_df = app_df[app_df["app_id"] == selected_app_id]

    token_col, cost_col, latency_col = st.columns(3)

    num_tokens = selected_rows["total_tokens"].iloc[0]
    token_col.metric(label="Total tokens (#)", value=num_tokens)

    cost = selected_rows["total_cost"].iloc[0]
    average_cost = app_specific_df["total_cost"].mean()
    delta_cost = "{:.3g}".format(cost - average_cost)
    cost_col.metric(
        label="Total cost (USD)",
        value=cost,
        delta=delta_cost,
        # "inverse": a positive delta (above the average) is shown as bad.
        delta_color="inverse",
    )

    latency = selected_rows["latency"].iloc[0]
    average_latency = app_specific_df["latency"].mean()
    delta_latency = "{:.3g}s".format(latency - average_latency)
    latency_col.metric(
        label="Latency (s)",
        value=latency,
        delta=delta_latency,
        delta_color="inverse",
    )
|
134 |
+
|
135 |
+
|
136 |
+
# Main body of the Evaluations page: an application filter, a "Records" tab
# (record grid plus details of the selected record) and a "Feedback
# Functions" tab (one histogram per feedback column).
if df_results.empty:
    st.write("No records yet...")

else:
    apps = list(df_results.app_id.unique())
    # Pre-select the app chosen on the leaderboard page, if any; otherwise
    # start with every app selected.
    if "app" in st.session_state:
        app = st.session_state.app
    else:
        app = apps

    st.experimental_set_query_params(app=app)

    options = st.multiselect("Filter Applications", apps, default=app)

    if len(options) == 0:
        st.header("All Applications")
        app_df = df_results

    elif len(options) == 1:
        st.header(options[0])
        app_df = df_results[df_results.app_id.isin(options)]

    else:
        st.header("Multiple Applications Selected")
        app_df = df_results[df_results.app_id.isin(options)]

    tab1, tab2 = st.tabs(["Records", "Feedback Functions"])

    with tab1:
        # Work on a copy so the decoded text columns are not written into a
        # slice of (or directly into) df_results.
        evaluations_df = app_df.copy()

        # By default the cells in the df are unicode-escaped, so we have to reverse it.
        input_array = evaluations_df['input'].to_numpy()
        output_array = evaluations_df['output'].to_numpy()

        decoded_input = np.vectorize(
            lambda x: x.encode('utf-8').decode('unicode-escape')
        )(input_array)
        decoded_output = np.vectorize(
            lambda x: x.encode('utf-8').decode('unicode-escape')
        )(output_array)

        evaluations_df['input'] = decoded_input
        evaluations_df['output'] = decoded_output

        gb = GridOptionsBuilder.from_dataframe(evaluations_df)

        gb.configure_column("type", header_name="App Type")
        gb.configure_column("record_json", header_name="Record JSON", hide=True)
        gb.configure_column("app_json", header_name="App JSON", hide=True)
        gb.configure_column("cost_json", header_name="Cost JSON", hide=True)
        gb.configure_column("perf_json", header_name="Perf. JSON", hide=True)

        gb.configure_column("record_id", header_name="Record ID", hide=True)
        gb.configure_column("app_id", header_name="App ID")

        gb.configure_column("feedback_id", header_name="Feedback ID", hide=True)
        gb.configure_column("input", header_name="User Input")
        gb.configure_column(
            "output",
            header_name="Response",
        )
        gb.configure_column("total_tokens", header_name="Total Tokens (#)")
        gb.configure_column("total_cost", header_name="Total Cost (USD)")
        gb.configure_column("latency", header_name="Latency (Seconds)")
        gb.configure_column("tags", header_name="Tags")
        gb.configure_column("ts", header_name="Time Stamp", sort="desc")

        non_feedback_cols = [
            "app_id",
            "type",
            "ts",
            "total_tokens",
            "total_cost",
            "record_json",
            "latency",
            "record_id",
            "cost_json",
            "app_json",
            "input",
            "output",
            "perf_json",
        ]

        # Every remaining column is treated as a feedback column; the
        # "<name>_calls" companion columns are hidden.
        for feedback_col in evaluations_df.columns.drop(non_feedback_cols):
            if "distance" in feedback_col:
                gb.configure_column(
                    feedback_col, hide=feedback_col.endswith("_calls")
                )
            else:
                # cell highlight depending on feedback direction
                cellstyle = JsCode(
                    cellstyle_jscode[feedback_directions.get(
                        feedback_col, default_direction
                    )]
                )

                gb.configure_column(
                    feedback_col,
                    cellStyle=cellstyle,
                    hide=feedback_col.endswith("_calls")
                )

        gb.configure_pagination()
        gb.configure_side_bar()
        gb.configure_selection(selection_mode="single", use_checkbox=False)
        gridOptions = gb.build()
        data = AgGrid(
            evaluations_df,
            gridOptions=gridOptions,
            update_mode=GridUpdateMode.SELECTION_CHANGED,
            allow_unsafe_jscode=True,
        )

        selected_rows = data["selected_rows"]
        selected_rows = pd.DataFrame(selected_rows)

        if len(selected_rows) == 0:
            st.write("Hint: select a row to display details of a record")

        else:
            # Start the record specific section
            st.divider()

            # Positional access throughout: the selection frame is not
            # guaranteed to carry a 0-based index.
            row = selected_rows.iloc[0]
            selected_app_id = selected_rows["app_id"].iloc[0]
            selected_record_id = selected_rows["record_id"].iloc[0]

            # Breadcrumbs
            st.caption(f"{selected_app_id} / {selected_record_id}")
            st.header(f"{selected_record_id}")

            render_record_metrics(app_df, selected_rows)

            st.markdown("")

            prompt = selected_rows["input"].iloc[0]
            response = selected_rows["output"].iloc[0]
            details = selected_rows["app_json"].iloc[0]

            app_json = json.loads(
                details
            )  # apps may not be deserializable, don't try to, keep it json.

            # Display input/response side by side. In each column, we put them in tabs mainly for
            # formatting/styling purposes.
            input_col, response_col = st.columns(2)

            (input_tab,) = input_col.tabs(["Input"])
            with input_tab:
                with st.expander(
                        f"Input {render_selector_markdown(Select.RecordInput)}",
                        expanded=True):
                    write_or_json(st, obj=prompt)

            (response_tab,) = response_col.tabs(["Response"])
            with response_tab:
                with st.expander(
                        f"Response {render_selector_markdown(Select.RecordOutput)}",
                        expanded=True):
                    write_or_json(st, obj=response)

            feedback_tab, metadata_tab = st.tabs(["Feedback", "Metadata"])

            with metadata_tab:
                metadata = app_json.get("metadata")
                if metadata:
                    with st.expander("Metadata"):
                        st.markdown(draw_metadata(metadata))
                else:
                    st.write("No metadata found")

            with feedback_tab:
                if len(feedback_cols) == 0:
                    st.write("No feedback details")

                for fcol in feedback_cols:
                    feedback_name = fcol
                    feedback_result = row[fcol]

                    # Multi-call feedback columns share one "<base>_calls"
                    # column keyed by the part before the delimiter.
                    if MULTI_CALL_NAME_DELIMITER in fcol:
                        fcol = fcol.split(MULTI_CALL_NAME_DELIMITER)[0]
                    feedback_calls = row[f"{fcol}_calls"]

                    def display_feedback_call(call):
                        """Show the calls of one feedback as a colored table."""

                        def highlight(s):
                            # Distance-style feedback gets the neutral color;
                            # everything else is colored by score category.
                            if "distance" in feedback_name:
                                return [
                                    f"background-color: {CATEGORY.UNKNOWN.color}"
                                ] * len(s)
                            cat = CATEGORY.of_score(
                                s.result,
                                higher_is_better=feedback_directions.get(
                                    fcol, default_direction
                                ) == default_direction
                            )
                            return [f"background-color: {cat.color}"] * len(s)

                        if call is not None and len(call) > 0:
                            df = pd.DataFrame.from_records(
                                [entry["args"] for entry in call]
                            )
                            # A missing return value is shown as -1.
                            df["result"] = pd.DataFrame(
                                [
                                    float(entry["ret"])
                                    if entry["ret"] is not None else -1
                                    for entry in call
                                ]
                            )
                            df["meta"] = pd.Series(
                                [entry["meta"] for entry in call]
                            )
                            # Expand each meta dict into its own columns.
                            df = df.join(df.meta.apply(lambda m: pd.Series(m))
                                        ).drop(columns="meta")

                            st.dataframe(
                                df.style.apply(highlight, axis=1).format(
                                    "{:.2}", subset=["result"]
                                )
                            )

                        else:
                            st.text("No feedback details.")

                    with st.expander(f"{feedback_name} = {feedback_result}",
                                     expanded=True):
                        display_feedback_call(feedback_calls)

            record_str = selected_rows["record_json"].iloc[0]
            record_json = json.loads(record_str)
            record = Record.model_validate(record_json)

            classes: Iterable[Tuple[Lens, ComponentView]
                             ] = list(instrumented_component_views(app_json))
            classes_map = {path: view for path, view in classes}

            st.markdown("")
            st.subheader("Timeline")
            val = record_viewer(record_json, app_json)
            st.markdown("")

            match_query = None

            # Assumes record_json['perf']['start_time'] is always present
            if val != "":
                # The viewer's value is compared against each call's ISO
                # start time to find the selected call.
                match = None
                for call in record.calls:
                    if call.perf.start_time.isoformat() == val:
                        match = call
                        break

                if match:
                    app_call = match.stack[-1]

                    match_query = match.top().path

                    st.subheader(
                        f"{app_call.method.obj.cls.name} {render_selector_markdown(Select.for_app(match_query))}"
                    )

                    draw_call(match)

                    view = classes_map.get(match_query)
                    if view is not None:
                        render_component(
                            query=match_query, component=view, header=False
                        )
                    else:
                        st.write(
                            f"Call by `{match_query}` was not associated with any instrumented"
                            " component."
                        )
                        # Look up whether there was any data at that path even if not an instrumented component:

                        try:
                            app_component_json = list(
                                match_query.get(app_json)
                            )[0]
                            if app_component_json is not None:
                                with st.expander(
                                        "Uninstrumented app component details."
                                ):
                                    st.json(app_component_json)
                        except Exception:
                            st.write(
                                f"Recorded invocation by component `{match_query}` but cannot find this component in the app json."
                            )

                else:
                    st.text("No match found")
            else:
                st.subheader(f"App {render_selector_markdown(Select.App)}")
                with st.expander("App Details:"):
                    st.json(jsonify_for_ui(app_json))

            if match_query is not None:
                # Placeholder filled in only if at least one subcomponent
                # is rendered below it.
                container = st.empty()

                has_subcomponents = False
                for query, component in classes:
                    if not match_query.is_immediate_prefix_of(query):
                        continue

                    if len(query.path) == 0:
                        # Skip App, will still list App.app under "app".
                        continue

                    has_subcomponents = True
                    render_component(query, component)

                if has_subcomponents:
                    container.markdown("#### Subcomponents:")

            st.header("More options:")

            if st.button("Display full app json"):
                st.write(jsonify_for_ui(app_json))

            if st.button("Display full record json"):
                st.write(jsonify_for_ui(record_json))

    with tab2:
        feedback = feedback_cols
        cols = 4
        # Ceiling division: no empty trailing row when the number of
        # feedback columns is a multiple of `cols`.
        rows = (len(feedback) + cols - 1) // cols

        for row_num in range(rows):
            with st.container():
                columns = st.columns(cols)
                for col_num in range(cols):
                    with columns[col_num]:
                        ind = row_num * cols + col_num
                        if ind < len(feedback):
                            # Generate histogram
                            fig, ax = plt.subplots()
                            bins = [
                                0, 0.2, 0.4, 0.6, 0.8, 1.0
                            ]  # Quintile buckets
                            ax.hist(
                                app_df[feedback[ind]],
                                bins=bins,
                                edgecolor="black",
                                color="#2D736D"
                            )
                            ax.set_xlabel("Feedback Value")
                            ax.set_ylabel("Frequency")
                            ax.set_title(feedback[ind], loc="center")
                            st.pyplot(fig)
                            # pyplot keeps every figure alive until closed;
                            # release it so reruns do not accumulate figures.
                            plt.close(fig)
|
pages/3_app.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
|
4 |
+
# Build a listing of ./raw_documents and the current directory; fall back
# to "NA" when either directory cannot be listed.
try:
    raw_docs_files = ", ".join(os.listdir("./raw_documents"))
    curr_directory_files = ", ".join(os.listdir("."))
    file_ls_str = raw_docs_files + "\n\n" + curr_directory_files
except OSError:
    # Only filesystem errors (missing directory, permissions, ...) are
    # expected here; a bare `except:` would also swallow KeyboardInterrupt
    # and SystemExit.
    file_ls_str = "NA"

st.write(f"Hello World! File list: {file_ls_str}")
|
streamlit_app.py
CHANGED
@@ -22,8 +22,11 @@ evaluation_path = pkg_resources.resource_filename(
|
|
22 |
ux_path = pkg_resources.resource_filename(
|
23 |
"trulens_eval", "ux"
|
24 |
)
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
27 |
if os.path.exists("./ux"):
|
28 |
shutil.rmtree("./ux")
|
29 |
shutil.copytree(ux_path, "./ux")
|
|
|
22 |
ux_path = pkg_resources.resource_filename(
|
23 |
"trulens_eval", "ux"
|
24 |
)
|
25 |
+
|
26 |
+
os.makedirs("./pages", exist_ok=True)
|
27 |
+
shutil.copyfile(leaderboard_path, os.path.join("./pages", "1_Leaderboard.py"))
|
28 |
+
shutil.copyfile(evaluation_path, os.path.join("./pages", "2_Evaluations.py"))
|
29 |
+
|
30 |
if os.path.exists("./ux"):
|
31 |
shutil.rmtree("./ux")
|
32 |
shutil.copytree(ux_path, "./ux")
|