loubnabnl HF staff committed on
Commit
b19b634
·
verified ·
1 Parent(s): bbfd188

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -18
app.py CHANGED
@@ -4,26 +4,51 @@ import os
4
 
5
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
 
7
- st.set_page_config(page_title="Synthetic textbooks inspection", layout="wide")
8
- st.title("Synthetic textbooks inspection")
9
- st.markdown("Inspection of synthetic textbooks generated by `Falcon-180B-chat`")
10
-
11
- @st.cache_data()
12
- def load_data(source="all"):
13
- ds = load_dataset("HuggingFaceTB/synthetic_textbooks_subset", split="train", use_auth_token=HF_TOKEN)
14
- if source != "all":
15
- ds = ds.filter(lambda x: x["source"] == source)
 
 
 
 
 
 
 
 
 
16
  return ds
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- source = st.selectbox("Data source", ['all', 'wikihow','khan_academy', 'stanford_cources', 'rw_wikihow', 'rw_stanford'])
20
- samples = load_data(source)
21
- n_samples = len(samples)
22
 
23
- index_example = st.number_input(f"Index of the sample (out of {n_samples}):", min_value=0, max_value=n_samples-1, value=0, step=1)
24
- st.markdown(f"Example belongs to source: {samples[index_example]['source']}")
25
- st.subheader("Prompt")
26
- st.markdown(samples[index_example]["prompt"])
27
 
28
- st.subheader("Textbook")
29
- st.markdown(samples[index_example]['textbook'])
 
4
 
5
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
 
7
+ st.set_page_config(page_title="FW Clusters inspection", layout="wide")
8
+ st.title("FW clusters inspection")
9
+
10
+ st.markdown("""
11
+ We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
12
+
13
+ Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material.
14
+
15
+ Additionally, the model was tasked with assigning a category to each cluster from 23 predefined categories found in [AFAIK](https://afaik.io/).
16
+
17
+ Sometimes, the model may define its own category. This can happen either within the context of AFAIK topics or separately. Hence the `Select Category Type` dropdown in our interface.
18
+ """)
19
+
20
+ @st.cache_data
21
+ def load_data(educational_topic):
22
+ ds = load_dataset("HuggingFaceTB/FW_clusters_under_afaik_topics", split="train", token=HF_TOKEN, num_proc=2)
23
+ if educational_topic in ['Yes', 'No']:
24
+ ds = ds.filter(lambda x: x['is_topic_educational'] == educational_topic)
25
  return ds
26
 
27
+ @st.cache_data
28
+ def get_categories_by_type(_ds, category_type):
29
+ filtered_ds = _ds.filter(lambda x: x['category_type'] == category_type)
30
+ return list(set(filtered_ds['category']))
31
+
32
+
33
+ st.subheader("Cluster information")
34
+ col_1, col_2, col_3 = st.columns(3)
35
+ with col_1:
36
+ educational_topic = st.selectbox('Are the topics deemed educational by the LLM?', ["Yes", "No"])
37
+
38
+ ds = load_data(educational_topic)
39
+
40
+ with col_2:
41
+ category_types = ['afaik', 'defined_by_llm', 'defined_by_llm_under_afaik']
42
+ selected_category_type = st.selectbox("Select Category Type", category_types)
43
+ with col_3:
44
+ categories = get_categories_by_type(ds, selected_category_type)
45
+ selected_category = st.selectbox("Select Category", categories)
46
 
47
+ selected_cluster = ds.filter(lambda x: x['category'] == selected_category)
 
 
48
 
49
+ # Select sample index
50
+ n_samples = len(selected_cluster["examples"])
51
+ index_example = st.number_input(f"Index of a sample: 0 - {n_samples}", min_value=0, max_value=n_samples-1, value=0, step=1)
 
52
 
53
+ sample = selected_cluster["examples"][index_example]
54
+ st.markdown(sample)