import html

import streamlit as st
from datasets import load_dataset
# Colour constant for prompt/generation markup.
PROMPT_COLOR = "#CA437E"

# Page-level settings, issued ahead of every other Streamlit call in this script.
st.set_page_config(page_icon="🧊", layout="wide")

# Short description shown at the top of the page.
st.write(
    "This is an application for viewing different generations for the same prompt. The generations vary depending on the checkpoint used and also the parameters used for the generation."
)

# Hugging Face token read from Streamlit's secrets store; a missing entry
# raises here, before any dataset access is attempted.
HF_API_TOKEN = st.secrets["HF_API_TOKEN"]
def safe_text(text):
    """Return *text* as an HTML fragment suitable for ``st.markdown``.

    The text is HTML-escaped first, so characters such as ``<``, ``>`` and
    ``&`` coming from the dataset are displayed literally instead of being
    interpreted as markup when rendered with ``unsafe_allow_html=True``.
    Newlines are then turned into explicit line breaks.

    Args:
        text: The raw string to display.

    Returns:
        The escaped text, with line breaks, wrapped in a block element.
    """
    # NOTE(review): the string literals in this function were broken across
    # lines (unterminated), apparently because their HTML markup had been
    # stripped. `<br>` and `<pre>` are reconstructed from context -- confirm
    # against the intended markup.
    escaped = html.escape(text)
    # Escape before inserting <br>, otherwise the break tags themselves
    # would be escaped.
    with_breaks = escaped.replace("\n", "<br>")
    return f"<pre>{with_breaks}</pre>"
def prompt_markup_format(text):
    """Wrap *text* in the markup used for prompts.

    Args:
        text: The prompt text to wrap.

    Returns:
        The markup string with *text* interpolated.
    """
    # NOTE(review): `<*font ...>` / `*font>` is not valid HTML; the string is
    # kept exactly as found because the intended markup cannot be determined
    # from this file -- confirm.
    return f'<*font color="black">{text}*font>'
def generation_markup_format(text):
    """Return the markup used to display a generation.

    Args:
        text: The generation text (already prepared for HTML display).

    Returns:
        *text* formatted as a string, with no additional markup.
    """
    # NOTE(review): this currently adds no markup at all; a colour wrapper
    # may have been stripped from the string -- confirm the intended output.
    return f"{text}"
# Load the generations dataset, authenticating with the token read from
# Streamlit secrets. Only the "train" split is used.
ds = load_dataset("bigscience/bloom-generations", use_auth_token=HF_API_TOKEN)
ds = ds["train"]
possible_langs = ds.unique("lang")

col_1, col_2 = st.columns(2)

# NOTE(review): the indentation of this script was lost and three string
# literals were split across lines. Which statements belong to each `with`
# block is reconstructed from statement order, and the heading strings keep
# only the text that was still visible (any surrounding HTML is gone) --
# confirm both.

# Left column: pick a language and a prompt, then display the prompt.
with col_1:
    st.markdown("Prompt\n", unsafe_allow_html=True)

    # "all" is an extra choice that disables the language filter.
    chosen_lang = st.selectbox("Choose a lang", possible_langs + ["all"])
    if chosen_lang == "all":
        ds_lang = ds
    else:
        ds_lang = ds.filter(
            lambda exs: [lang == chosen_lang for lang in exs["lang"]],
            batched=True,
        )

    possible_prompts = ds_lang.unique("prompt")
    chosen_prompt = st.selectbox("Choose a prompt", possible_prompts)
    st.markdown(safe_text(chosen_prompt), unsafe_allow_html=True)

    # Every record that shares the chosen prompt.
    sub_ds = ds_lang.filter(
        lambda exs: [prompt == chosen_prompt for prompt in exs["prompt"]],
        batched=True,
    )

# Right column: pick one generation for the prompt and show it with its
# generation settings.
with col_2:
    st.markdown("Generation\n", unsafe_allow_html=True)

    index_sample = st.number_input(
        "Index of the chosen generation",
        min_value=0,
        max_value=len(sub_ds) - 1,
        value=0,
        step=1,
    )
    sample = sub_ds[index_sample]
    generation = sample["generation"]

    # Lets the user truncate the displayed generation; defaults to the full text.
    stop_index_sample = st.number_input(
        "Stop generation at character number",
        min_value=0,
        max_value=len(generation),
        value=len(generation),
        step=1,
    )
    markdown_text = generation_markup_format(
        safe_text(generation[:stop_index_sample])
    )
    st.markdown(markdown_text, unsafe_allow_html=True)

    st.markdown("Generation configuration\n", unsafe_allow_html=True)

    # Everything in the record except the prompt and the generated text.
    config = {
        key: value
        for key, value in sample.items()
        if key not in ("prompt", "generation")
    }
    # Explicit st.write instead of a bare `config` expression, so the output
    # does not depend on Streamlit's "magic" display of bare expressions.
    st.write(config)