Spaces:
Runtime error
Runtime error
File size: 2,728 Bytes
5d377d2 4fc723d 5d377d2 f672036 5d377d2 cf5b1f9 5d377d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import json
import streamlit as st
# Page-level settings, handed to Streamlit as keyword arguments below.
_PAGE_SETTINGS = {
    "page_title": "BigScience Training Corpus",
    "page_icon": "https://avatars.githubusercontent.com/u/82455566",
    "layout": "wide",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**_PAGE_SETTINGS)

# Query-string parameters of the current page URL; the "source" entry is read
# later in this script to pre-select a source in the sidebar.
query_params = st.experimental_get_query_params()
def _language_key(ln_code):
    """Return the catalogue key for a language code (all ``zh*`` codes share ``"zh"``)."""
    return "zh" if ln_code.startswith("zh") else ln_code


def _size_in_language(source, ln):
    """Return the ``size`` of the first language entry of *source* filed under *ln*."""
    # Compare normalised codes: sources are grouped under the normalised key,
    # so comparing the raw code would find no entry for variants such as a
    # "zh"-prefixed code and fail with an IndexError.
    return next(
        ln_dct["size"]
        for ln_dct in source["languages"]
        if _language_key(ln_dct["ln_code"]) == ln
    )


@st.cache()
def load_catalogue():
    """Load the source catalogue and index it by language.

    Reads ``resources/sources_with_info_cards.json``, which is iterated as
    ``(source_name, source)`` pairs; the entry named ``"aggregated"`` is
    skipped. Each ``source`` is expected to carry a ``"languages"`` list of
    dicts with ``"ln_code"`` and ``"size"`` keys.

    Returns:
        A dict, ordered by key, mapping a language key to a
        ``{source_name: source}`` dict. The ``"all"`` key holds every source
        in file order; every other key holds the sources listing that
        language, ordered by decreasing ``size`` for that language.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(
        "resources/sources_with_info_cards.json", encoding="utf-8"
    ) as catalogue_file:
        entries = json.load(catalogue_file)

    full_catalogue = {
        source_name: source
        for source_name, source in entries
        if source_name != "aggregated"
    }

    language_catalogues = {"all": full_catalogue}
    for source_name, source in full_catalogue.items():
        for ln_dct in source["languages"]:
            ln_code = _language_key(ln_dct["ln_code"])
            language_catalogues.setdefault(ln_code, {})[source_name] = source

    # Only existing keys are reassigned here, so iterating the dict is safe.
    for ln, sources in language_catalogues.items():
        if ln == "all":
            continue
        language_catalogues[ln] = dict(
            sorted(
                sources.items(),
                key=lambda item: _size_in_language(item[1], ln),
                reverse=True,
            )
        )

    return dict(sorted(language_catalogues.items()))
# Catalogue indexed by language key; "all" holds every source.
catalogue_by_ln = load_catalogue()

with st.sidebar:
    ln_select = st.selectbox(
        "Show source list for language:",
        catalogue_by_ln,
    )

    source_names = list(catalogue_by_ln[ln_select])

    # Only the "all" view honours the ?source=... query parameter. The value
    # comes from the URL, so fall back to the first entry when it is missing
    # or does not name a known source, instead of letting list.index() raise
    # ValueError and break the page.
    default_index = 0
    if ln_select == "all":
        requested_source = (query_params.get("source") or [None])[0]
        if requested_source in source_names:
            default_index = source_names.index(requested_source)

    source_select = st.selectbox(
        "Show information for source:",
        source_names,
        index=default_index,
    )
    # Mirror the current selection back into the URL.
    st.experimental_set_query_params(**{"source": source_select})

# Details are always looked up in the full catalogue, whatever the language filter.
selected_source = catalogue_by_ln["all"][source_select]

with st.expander(f"Dataset Card for {source_select}", expanded=True):
    st.markdown(selected_source["data_card"])

# Optional sections, shown only when the source provides the matching key.
# NOTE(review): these are rendered as top-level expanders, siblings of the
# dataset card; confirm this matches the intended layout.
for info_key, info_title in (
    ("catalogue_info", "Catalogue Information"),
    ("seed_info", "Pseudocrawl Seed Information"),
    ("hf_info", "HF Dataset Information"),
):
    if info_key in selected_source:
        with st.expander(f"{info_title} for {source_select}"):
            st.write(selected_source[info_key])
|