Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -4,64 +4,22 @@ import os
|
|
4 |
|
5 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
6 |
|
7 |
-
st.set_page_config(page_title="
|
8 |
-
st.title("
|
9 |
-
|
10 |
-
st.markdown("""
|
11 |
-
We clustered 100k web samples using [text-clustering](https://github.com/huggingface/text-clustering).
|
12 |
-
|
13 |
-
Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. \
|
14 |
-
Technically, we provide it with 10 random examples from the cluster in the prompt and ask it to judge their topics.
|
15 |
-
|
16 |
-
Additionally, the model was tasked with finding the topic of each cluster (based on the 10 random examples).
|
17 |
-
""")
|
18 |
|
19 |
|
20 |
@st.cache_data
|
21 |
-
def load_data(min_score=
|
22 |
-
|
23 |
-
ds =
|
24 |
-
def filter_func(x):
|
25 |
-
try:
|
26 |
-
score = int(x['educational_score'])
|
27 |
-
value = False if show_special else min_score <= score <= max_score
|
28 |
-
return value
|
29 |
-
except (ValueError, TypeError):
|
30 |
-
# Return True if show_special is checked and educational_score is None or ''
|
31 |
-
return show_special
|
32 |
-
|
33 |
-
ds = ds.filter(filter_func)
|
34 |
return ds
|
35 |
|
36 |
-
st.subheader("Cluster information")
|
37 |
-
col_1, col_2, col_3 = st.columns(3)
|
38 |
-
with col_1:
|
39 |
-
show_special = st.checkbox('Show only clusters with undefined educational score', False)
|
40 |
-
with col_2:
|
41 |
-
min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
|
42 |
-
with col_3:
|
43 |
-
max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')
|
44 |
-
|
45 |
-
# Load data based on slider values and checkbox status
|
46 |
-
ds = load_data(min_value, max_value, show_special)
|
47 |
-
categories = list(set(ds["category"]))
|
48 |
-
selected_category = st.selectbox("Select a topic", categories)
|
49 |
-
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)
|
50 |
-
|
51 |
-
# Select sample index
|
52 |
-
n_samples = len(selected_cluster)
|
53 |
-
if n_samples > 0:
|
54 |
-
col_1, col_2 = st.columns(2)
|
55 |
-
with col_1:
|
56 |
-
index_cluster = st.number_input(f"Found {len(selected_cluster)} clusters, choose one", min_value=0, max_value=len(selected_cluster)-1, value=0, step=1)
|
57 |
|
58 |
-
|
|
|
|
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
st.markdown(f"**Educational score of the cluster**: {selected_cluster[index_cluster]['educational_score']}")
|
65 |
-
st.markdown(sample)
|
66 |
-
else:
|
67 |
-
st.markdown("No files found, change the cluster.")
|
|
|
4 |
|
5 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
6 |
|
7 |
+
st.set_page_config(page_title="SelfCheck", layout="wide")
|
8 |
+
st.title("SelfCheck scores")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
@st.cache_data
def load_data(min_score=0.4):
    """Load the scored samples and keep those at or above ``min_score``.

    Reads the ``train`` split of
    ``HuggingFaceTB/hallucinations_450_samples_scores`` (authenticating with
    ``HF_TOKEN``) and drops every row whose ``passage_score`` is below the
    threshold. The function is wrapped in ``st.cache_data``.

    Args:
        min_score: Lowest ``passage_score`` a row may have to be kept.

    Returns:
        The filtered dataset.
    """
    def meets_threshold(row):
        # Keep rows whose passage score reaches the requested minimum.
        return row["passage_score"] >= min_score

    scored = load_dataset(
        "HuggingFaceTB/hallucinations_450_samples_scores",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    return scored.filter(meets_threshold)
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
+
# Minimum SelfCheck passage score a sample must reach to be listed.
min_value = st.slider('Select minimum selfcheck score', 0.0, 1.0, 0.1, key='min_score')

# Load data based on the slider value (previously a hard-coded 0.2 was passed,
# so moving the slider had no effect).
ds = load_data(min_score=min_value)
n_samples = len(ds)

if n_samples > 0:
    index = st.number_input(f'Found {n_samples} samples, choose one', min_value=0, max_value=n_samples - 1, value=0, step=1)

    # Show the selected sample: its metadata first, then the text itself.
    sample = ds[index]
    st.markdown(f"**Passage Score:** {sample['passage_score']}, seed data: {sample['seed_data']}, format: {sample['format']}.")
    st.markdown(sample['original_text'])
else:
    # An empty result would give number_input max_value=-1 (below min_value=0)
    # and make ds[0] fail, so report it instead of building the picker.
    st.markdown("No samples found, lower the minimum selfcheck score.")
|
|
|
|
|
|
|
|