Spaces:
Runtime error
Runtime error
Yacine Jernite
committed on
Commit
·
5d377d2
1
Parent(s):
208a5e1
app
Browse files- app.py +79 -0
- resources/sources_with_info_cards.json +0 -0
app.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
# Page-level Streamlit settings, gathered in one mapping and applied in a
# single call before any other Streamlit command is issued.
_PAGE_SETTINGS = {
    "page_title": "BigScience Training Corpus",
    "page_icon": "https://avatars.githubusercontent.com/u/82455566",
    "layout": "wide",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**_PAGE_SETTINGS)

# Query parameters of the current URL; read further down to pre-select a
# source from the "source" parameter.
query_params = st.experimental_get_query_params()
|
13 |
+
|
14 |
+
|
15 |
+
@st.cache()
def load_catalogue():
    """Load the source catalogue and index it by language code.

    Reads ``resources/sources_with_info_cards.json``, drops the entry named
    ``"aggregated"``, and files every remaining source under each language
    code listed in its ``"languages"`` field. Language codes that start
    with ``"zh"`` are all grouped under the single key ``"zh"``.

    Returns:
        A dict mapping ``"all"`` and each language code to a
        ``{source_name: source}`` dict. The outer dict is sorted by key.
        Each per-language dict is ordered by the source's ``"size"`` for
        that language, largest first; ``"all"`` keeps the file's order.

    Raises:
        OSError: if the resource file cannot be opened.
        KeyError: if a source lacks ``"languages"`` or a language entry
            lacks ``"ln_code"`` or ``"size"``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(
        "resources/sources_with_info_cards.json", encoding="utf-8"
    ) as catalogue_file:
        raw_catalogue = json.load(catalogue_file)
    # NOTE(review): the JSON's top-level shape is not visible from this
    # file. The original code unpacked (name, source) pairs directly; a
    # mapping is accepted as well by iterating its items.
    if isinstance(raw_catalogue, dict):
        raw_catalogue = raw_catalogue.items()

    full_catalogue = {
        source_name: source
        for source_name, source in raw_catalogue
        if source_name != "aggregated"
    }

    language_catalogues = {"all": full_catalogue}
    # Size of each source per *normalised* language code, recorded while
    # grouping. The previous sort key re-scanned source["languages"] for an
    # exact match on the normalised code, which found nothing for sources
    # that only list a "zh-*" variant and raised IndexError on the "[0]".
    sizes_by_ln = {}
    for source_name, source in full_catalogue.items():
        for ln_dct in source["languages"]:
            raw_code = ln_dct["ln_code"]
            ln_code = "zh" if raw_code.startswith("zh") else raw_code
            language_catalogues.setdefault(ln_code, {})[source_name] = source
            # First matching entry wins, mirroring the original "[...][0]".
            sizes_by_ln.setdefault(ln_code, {}).setdefault(
                source_name, ln_dct["size"]
            )

    ordered_catalogues = {}
    for ln, sources in language_catalogues.items():
        if ln == "all":
            ordered_catalogues[ln] = sources
            continue
        ln_sizes = sizes_by_ln[ln]
        ordered_catalogues[ln] = dict(
            sorted(
                sources.items(),
                key=lambda item: ln_sizes[item[0]],
                reverse=True,
            )
        )
    return dict(sorted(ordered_catalogues.items()))
|
48 |
+
|
49 |
+
|
50 |
+
catalogue_by_ln = load_catalogue()

with st.sidebar:
    ln_select = st.selectbox(
        "Show source for language:",
        catalogue_by_ln,
    )
    source_names = list(catalogue_by_ln[ln_select])
    # The "source" query parameter can name a source that is not part of
    # the currently selected language (for instance after the language
    # selector changes, or when following an old link). Fall back to the
    # first source instead of letting list.index() raise ValueError.
    requested_source = (query_params.get("source") or [None])[0]
    if requested_source in source_names:
        default_index = source_names.index(requested_source)
    else:
        default_index = 0
    source_select = st.selectbox(
        "Show information for source:",
        catalogue_by_ln[ln_select],
        index=default_index,
    )
    # Mirror the current selection back into the URL.
    st.experimental_set_query_params(**{"source": source_select})

# Every source is present in the "all" catalogue, so look it up once there.
source_info = catalogue_by_ln["all"][source_select]

with st.expander(f"Dataset Card for {source_select}", expanded=True):
    st.markdown(source_info["data_card"])

# Optional sections: each is rendered only when the source carries that key.
for info_key, section_title in (
    ("catalogue_info", "Catalogue Information"),
    ("seed_info", "Pseudocrawl Seed Information"),
    ("hf_info", "HF Dataset Information"),
):
    if info_key in source_info:
        with st.expander(f"{section_title} for {source_select}"):
            st.write(source_info[info_key])
|
resources/sources_with_info_cards.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|