Spaces:

bigscience
/

BigScienceCorpus

Runtime error

App Files Files Community

Yacine Jernite committed on Mar 30, 2022

Commit

5d377d2

1 Parent(s): 208a5e1

app

Browse files

Files changed (2) hide show

app.py +79 -0
resources/sources_with_info_cards.json +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import json
+import streamlit as st
+st.set_page_config(
+    page_title="BigScience Training Corpus",
+    page_icon="https://avatars.githubusercontent.com/u/82455566",
+    layout="wide",
+    initial_sidebar_state="auto",
+)
+query_params = st.experimental_get_query_params()
+@st.cache()
+def load_catalogue():
+    full_catalogue = dict(
+        [
+            (source_name, source)
+            for source_name, source in json.load(
+                open("resources/sources_with_info_cards.json")
+            )
+            if source_name != "aggregated"
+        ]
+    )
+    language_catalogues = {
+        "all": full_catalogue,
+    }
+    for source_name, source in full_catalogue.items():
+        for ln_dct in source["languages"]:
+            ln_code = "zh" if ln_dct["ln_code"].startswith("zh") else ln_dct["ln_code"]
+            language_catalogues[ln_code] = language_catalogues.get(ln_code, {})
+            language_catalogues[ln_code][source_name] = source
+    for ln in language_catalogues:
+        if ln != "all":
+            language_catalogues[ln] = dict(
+                sorted(
+                    language_catalogues[ln].items(),
+                    key=lambda x: [
+                        ln_dct["size"]
+                        for ln_dct in x[1]["languages"]
+                        if ln_dct["ln_code"] == ln
+                    ][0],
+                    reverse=True,
+                )
+            )
+    return dict(sorted(language_catalogues.items()))
+catalogue_by_ln = load_catalogue()
+with st.sidebar:
+    ln_select = st.selectbox(
+        "Show source for language:",
+        catalogue_by_ln,
+    )
+    source_select = st.selectbox(
+        "Show information for source:",
+        catalogue_by_ln[ln_select],
+        index=list(catalogue_by_ln[ln_select]).index(
+            query_params.get("source", [list(catalogue_by_ln[ln_select].keys())[0]])[0]
+        ),
+    )
+    st.experimental_set_query_params(**{"source": source_select})
+with st.expander(f"Dataset Card for {source_select}", expanded=True):
+    st.markdown(catalogue_by_ln["all"][source_select]["data_card"])
+if "catalogue_info" in catalogue_by_ln["all"][source_select]:
+    with st.expander(f"Catalogue Information for {source_select}"):
+        st.write(catalogue_by_ln["all"][source_select]["catalogue_info"])
+if "seed_info" in catalogue_by_ln["all"][source_select]:
+    with st.expander(f"Pseudocrawl Seed Information for {source_select}"):
+        st.write(catalogue_by_ln["all"][source_select]["seed_info"])
+if "hf_info" in catalogue_by_ln["all"][source_select]:
+    with st.expander(f"HF Dataset Information for {source_select}"):
+        st.write(catalogue_by_ln["all"][source_select]["hf_info"])

resources/sources_with_info_cards.json CHANGED Viewed

The diff for this file is too large to render. See raw diff