loubnabnl HF staff committed on
Commit
042d897
·
verified ·
1 Parent(s): e8b851a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -54
app.py CHANGED
@@ -4,64 +4,22 @@ import os
4
 
5
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
 
7
- st.set_page_config(page_title="Web Clusters inspection", layout="wide")
8
- st.title("Web clusters inspection")
9
-
10
- st.markdown("""
11
- We clustered 100k web samples using [text-clustering](https://github.com/huggingface/text-clustering).
12
-
13
- Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. \
14
- Technically, we provide it with 10 random examples from the cluster in the prompt and ask it to judge their topics.
15
-
16
- Additionally, the model was tasked with finding the topic of each cluster (based on the 10 random examples).
17
- """)
18
 
19
 
20
  @st.cache_data
21
- def load_data(min_score=1, max_score=10, show_special=False):
22
- # HuggingFaceTB/FW_clusters_free_topics
23
- ds = load_dataset("HuggingFaceTB/FW_clusters_100k_145_topics", split="train", token=HF_TOKEN, num_proc=2)
24
- def filter_func(x):
25
- try:
26
- score = int(x['educational_score'])
27
- value = False if show_special else min_score <= score <= max_score
28
- return value
29
- except (ValueError, TypeError):
30
- # Return True if show_special is checked and educational_score is None or ''
31
- return show_special
32
-
33
- ds = ds.filter(filter_func)
34
  return ds
35
 
36
- st.subheader("Cluster information")
37
- col_1, col_2, col_3 = st.columns(3)
38
- with col_1:
39
- show_special = st.checkbox('Show only clusters with undefined educational score', False)
40
- with col_2:
41
- min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
42
- with col_3:
43
- max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')
44
-
45
- # Load data based on slider values and checkbox status
46
- ds = load_data(min_value, max_value, show_special)
47
- categories = list(set(ds["category"]))
48
- selected_category = st.selectbox("Select a topic", categories)
49
- selected_cluster = ds.filter(lambda x: x['category'] == selected_category)
50
-
51
- # Select sample index
52
- n_samples = len(selected_cluster)
53
- if n_samples > 0:
54
- col_1, col_2 = st.columns(2)
55
- with col_1:
56
- index_cluster = st.number_input(f"Found {len(selected_cluster)} clusters, choose one", min_value=0, max_value=len(selected_cluster)-1, value=0, step=1)
57
 
58
- files = selected_cluster[index_cluster]["examples"]
 
 
59
 
60
- with col_2:
61
- index_example = st.number_input(f"Found {len(files)} files in the cluster, choose one", min_value=0, max_value=len(files)-1, value=0, step=1)
62
-
63
- sample = files[index_example]
64
- st.markdown(f"**Educational score of the cluster**: {selected_cluster[index_cluster]['educational_score']}")
65
- st.markdown(sample)
66
- else:
67
- st.markdown("No files found, change the cluster.")
 
4
 
5
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
 
7
+ st.set_page_config(page_title="SelfCheck", layout="wide")
8
+ st.title("SelfCheck scores")
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  @st.cache_data
12
def load_data(min_score=0.4):
    """Load the scored samples and keep those at or above a score threshold.

    Reads the ``train`` split of
    ``HuggingFaceTB/hallucinations_450_samples_scores`` (authenticating with
    the module-level ``HF_TOKEN``) and drops every row whose
    ``passage_score`` is below ``min_score``.

    Args:
        min_score: Lowest ``passage_score`` a row may have and still be kept.

    Returns:
        The filtered dataset.
    """
    def scores_high_enough(row):
        # Inclusive bound: a row exactly at the threshold is kept.
        return row["passage_score"] >= min_score

    scored_samples = load_dataset(
        "HuggingFaceTB/hallucinations_450_samples_scores",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    return scored_samples.filter(scores_high_enough)
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
# Let the user choose the minimum selfcheck score a sample must reach.
min_value = st.slider('Select minimum selfcheck score', 0.0, 1.0, 0.1, key='min_score')

# Load data based on the slider value. The threshold used to be hard-coded
# here, which left the slider without any effect on the displayed samples.
ds = load_data(min_score=min_value)

n_samples = len(ds)
if n_samples > 0:
    index = st.number_input(
        f'Found {n_samples} samples, choose one',
        min_value=0,
        max_value=n_samples - 1,
        value=0,
        step=1,
    )

    # Show the selected sample: its score and metadata first, then the text.
    sample = ds[index]
    st.markdown(f"**Passage Score:** {sample['passage_score']}, seed data: {sample['seed_data']}, format: {sample['format']}.")
    st.markdown(sample['original_text'])
else:
    # Without this guard the index picker would get max_value=-1 (below its
    # min_value of 0) and there would be no row to display.
    st.markdown("No samples found, lower the minimum selfcheck score.")