Spaces:
Sleeping
Sleeping
osanseviero
committed on
Commit
·
6c21ae3
1
Parent(s):
33c8677
Add languages
Browse files- changelog.md +7 -0
- language.py +0 -0
- models.py +126 -105
- utils.py +69 -0
changelog.md
CHANGED
@@ -1,5 +1,12 @@
|
|
1 |
Changelog
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
v0.1
|
4 |
- Allow picking the comparison version
|
5 |
- Show delta in all metrics
|
|
|
1 |
Changelog
|
2 |
|
3 |
+
v0.2 - Oct 24
|
4 |
+
- Languages
|
5 |
+
- Allow filtering for modality
|
6 |
+
- Show new languages for the diff
|
7 |
+
- Show rate of change in languages
|
8 |
+
- Also treat models carrying the multilingual tag as multilingual when selecting models in the Language tab
|
9 |
+
|
10 |
v0.1
|
11 |
- Allow picking the comparison version
|
12 |
- Show delta in all metrics
|
language.py
ADDED
File without changes
|
models.py
CHANGED
@@ -1,89 +1,62 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
-
from datasets import load_dataset
|
4 |
from ast import literal_eval
|
5 |
import altair as alt
|
6 |
-
import plotly.graph_objs as go
|
7 |
import matplotlib.pyplot as plt
|
8 |
|
9 |
-
|
10 |
-
print("Build")
|
11 |
-
nlp_tasks = ["text-classification", "text-generation", "text2text-generation", "token-classification", "fill-mask", "question-answering",
|
12 |
-
"translation", "conversational", "sentence-similarity", "summarization", "multiple-choice", "zero-shot-classification", "table-question-answering"
|
13 |
-
]
|
14 |
-
audio_tasks = ["automatic-speech-recognition", "audio-classification", "text-to-speech", "audio-to-audio", "voice-activity-detection"]
|
15 |
-
cv_tasks = ["image-classification", "image-segmentation", "zero-shot-image-classification", "image-to-image", "unconditional-image-generation", "object-detection"]
|
16 |
-
multimodal = ["feature-extraction", "text-to-image", "visual-question-answering", "image-to-text", "document-question-answering"]
|
17 |
-
tabular = ["tabular-classification", "tabular-regression"]
|
18 |
-
|
19 |
-
modalities = {
|
20 |
-
"nlp": nlp_tasks,
|
21 |
-
"audio": audio_tasks,
|
22 |
-
"cv": cv_tasks,
|
23 |
-
"multimodal": multimodal,
|
24 |
-
"tabular": tabular,
|
25 |
-
"rl": ["reinforcement-learning"]
|
26 |
-
}
|
27 |
-
|
28 |
-
def modality(row):
|
29 |
-
pipeline = row["pipeline"]
|
30 |
-
for modality, tasks in modalities.items():
|
31 |
-
if pipeline in tasks:
|
32 |
-
return modality
|
33 |
-
if type(pipeline) == "str":
|
34 |
-
return "unk_modality"
|
35 |
-
return None
|
36 |
|
|
|
|
|
37 |
supported_revisions = ["24_10_22", "17_10_22", "10_10_22", "27_09_22"]
|
38 |
-
|
39 |
-
st.cache(allow_output_mutation=True)
|
40 |
-
def process_dataset(version):
|
41 |
-
# Load dataset at specified revision
|
42 |
-
dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)
|
43 |
-
|
44 |
-
# Convert to pandas dataframe
|
45 |
-
data = dataset["train"].to_pandas()
|
46 |
-
|
47 |
-
# Add modality column
|
48 |
-
data["modality"] = data.apply(modality, axis=1)
|
49 |
-
|
50 |
-
# Bin the model card length into some bins
|
51 |
-
data["length_bins"] = pd.cut(data["text_length"], [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000])
|
52 |
-
|
53 |
-
return data
|
54 |
-
|
55 |
-
col1, col2 = st.columns(2)
|
56 |
with col1:
|
|
|
|
|
|
|
|
|
|
|
57 |
base = st.selectbox(
|
58 |
'Old revision',
|
59 |
supported_revisions,
|
60 |
index=1)
|
61 |
-
with
|
62 |
-
|
63 |
-
'
|
64 |
supported_revisions,
|
65 |
-
index=
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
old_data = process_dataset(base)
|
68 |
data = process_dataset(new)
|
69 |
-
|
70 |
-
def eval_tags(row):
|
71 |
-
tags = row["tags"]
|
72 |
-
if tags == "none" or tags == [] or tags == "{}":
|
73 |
-
return []
|
74 |
-
if tags[0] != "[":
|
75 |
-
tags = str([tags])
|
76 |
-
val = literal_eval(tags)
|
77 |
-
if isinstance(val, dict):
|
78 |
-
return []
|
79 |
-
return val
|
80 |
-
|
81 |
old_data["tags"] = old_data.apply(eval_tags, axis=1)
|
82 |
data["tags"] = data.apply(eval_tags, axis=1)
|
83 |
|
|
|
|
|
84 |
total_samples_old = old_data.shape[0]
|
85 |
total_samples = data.shape[0]
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
# Tabs don't work in Spaces st version
|
89 |
#tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
|
@@ -92,20 +65,9 @@ def main():
|
|
92 |
'Topic of interest',
|
93 |
["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
|
94 |
|
95 |
-
# with tab1:
|
96 |
if tab == "Language":
|
97 |
st.header("Languages info")
|
98 |
|
99 |
-
data.loc[data.languages == "False", 'languages'] = None
|
100 |
-
data.loc[data.languages == {}, 'languages'] = None
|
101 |
-
old_data.loc[old_data.languages == "False", 'languages'] = None
|
102 |
-
old_data.loc[old_data.languages == {}, 'languages'] = None
|
103 |
-
|
104 |
-
no_lang_count = data["languages"].isna().sum()
|
105 |
-
no_lang_count_old = old_data["languages"].isna().sum()
|
106 |
-
data["languages"] = data["languages"].fillna('none')
|
107 |
-
old_data["languages"] = old_data["languages"].fillna('none')
|
108 |
-
|
109 |
def make_list(row):
|
110 |
languages = row["languages"]
|
111 |
if languages == "none":
|
@@ -113,34 +75,86 @@ def main():
|
|
113 |
return literal_eval(languages)
|
114 |
|
115 |
def language_count(row):
|
116 |
-
|
117 |
-
leng = len(languages)
|
118 |
-
return leng
|
119 |
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
langs = langs[langs != {}]
|
128 |
-
total_langs = len(langs.unique())
|
129 |
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
|
|
134 |
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
with col1:
|
137 |
-
v = total_samples-no_lang_count
|
138 |
-
v_old = total_samples_old-no_lang_count_old
|
139 |
st.metric(label="Language Specified", value=v, delta=int(v-v_old))
|
140 |
with col2:
|
|
|
|
|
|
|
|
|
|
|
141 |
st.metric(label="No Language Specified", value=no_lang_count, delta=int(no_lang_count-no_lang_count_old))
|
142 |
-
with
|
|
|
|
|
|
|
|
|
|
|
143 |
st.metric(label="Total Unique Languages", value=total_langs, delta=int(total_langs-total_langs_old))
|
|
|
|
|
|
|
|
|
144 |
|
145 |
st.subheader("Count of languages per model repo")
|
146 |
st.text("Some repos are for multiple languages, so the count is greater than 1")
|
@@ -148,16 +162,21 @@ def main():
|
|
148 |
'All or just Multilingual',
|
149 |
["All", "Just Multilingual", "Three or more languages"])
|
150 |
|
151 |
-
filter = 0
|
152 |
-
st.text("Tofix: This just takes into account count of languages, it misses the multilingual tag")
|
153 |
-
if linguality == "Just Multilingual":
|
154 |
-
filter = 1
|
155 |
-
elif linguality == "Three or more languages":
|
156 |
-
filter = 2
|
157 |
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
df1 = models_with_langs['language_count'].value_counts()
|
160 |
-
models_with_langs_old = old_data[old_data["language_count"] > filter]
|
161 |
df1_old = models_with_langs_old['language_count'].value_counts()
|
162 |
st.bar_chart(df1)
|
163 |
|
@@ -174,13 +193,13 @@ def main():
|
|
174 |
else:
|
175 |
filter = 2
|
176 |
|
177 |
-
models_with_langs =
|
178 |
langs = models_with_langs["languages"].explode()
|
179 |
langs = langs[langs != {}]
|
180 |
orig_d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
181 |
d = orig_d
|
182 |
|
183 |
-
models_with_langs_old =
|
184 |
langs = models_with_langs_old["languages"].explode()
|
185 |
langs = langs[langs != {}]
|
186 |
orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
@@ -212,6 +231,8 @@ def main():
|
|
212 |
final_data = pd.merge(
|
213 |
d, orig_d_old, how="outer", on="language"
|
214 |
)
|
|
|
|
|
215 |
final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
|
216 |
|
217 |
st.dataframe(final_data)
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
|
|
3 |
from ast import literal_eval
|
4 |
import altair as alt
|
|
|
5 |
import matplotlib.pyplot as plt
|
6 |
|
7 |
+
from utils import process_dataset, eval_tags
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
+
def main():
|
10 |
+
# Pick revision at top
|
11 |
supported_revisions = ["24_10_22", "17_10_22", "10_10_22", "27_09_22"]
|
12 |
+
col1, col2, col3 = st.columns(3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
with col1:
|
14 |
+
new = st.selectbox(
|
15 |
+
'Last revision',
|
16 |
+
supported_revisions,
|
17 |
+
index=0)
|
18 |
+
with col2:
|
19 |
base = st.selectbox(
|
20 |
'Old revision',
|
21 |
supported_revisions,
|
22 |
index=1)
|
23 |
+
with col3:
|
24 |
+
base_old = st.selectbox(
|
25 |
+
'Very old revision',
|
26 |
supported_revisions,
|
27 |
+
index=2)
|
28 |
+
|
29 |
+
def change_pct(old, new):
    """Return the change from ``old`` to ``new`` as a percentage of ``new``.

    The result is rounded to three decimal places.

    NOTE(review): the denominator is ``new`` rather than the more common
    ``old``; this is kept as-is so displayed figures do not shift -- confirm
    that this is the intended definition.

    When ``new`` is 0 the ratio is undefined; 0.0 is returned so the metric
    widgets keep rendering instead of raising ZeroDivisionError (e.g. when a
    modality filter leaves no languages).
    """
    if new == 0:
        return 0.0
    return round(100 * (new - old) / new, 3)
|
31 |
+
|
32 |
+
def change_and_delta(old_old, old, new):
    """Return the latest rate of change and how it moved, as display strings.

    Args:
        old_old: value at the oldest revision.
        old: value at the middle revision.
        new: value at the latest revision.

    Returns:
        A ``(curr_change, delta)`` tuple of strings ending in ``%``:
        ``curr_change`` is ``change_pct(old, new)`` and ``delta`` is the
        difference between that and ``change_pct(old_old, old)``.
    """
    curr_change = change_pct(old, new)
    prev_change = change_pct(old_old, old)
    # Round the difference: subtracting two rounded floats can still yield
    # noise such as 0.30000000000000004, which would be shown verbatim.
    delta = f"{round(curr_change - prev_change, 3)}%"
    return f"{curr_change}%", delta
|
38 |
+
|
39 |
+
# Process dataset
|
40 |
+
old_old_data = process_dataset(base_old)
|
41 |
old_data = process_dataset(base)
|
42 |
data = process_dataset(new)
|
43 |
+
old_old_data["tags"] = old_old_data.apply(eval_tags, axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
old_data["tags"] = old_data.apply(eval_tags, axis=1)
|
45 |
data["tags"] = data.apply(eval_tags, axis=1)
|
46 |
|
47 |
+
# High level count of models and rate of change
|
48 |
+
total_samples_old_old = old_old_data.shape[0]
|
49 |
total_samples_old = old_data.shape[0]
|
50 |
total_samples = data.shape[0]
|
51 |
+
|
52 |
+
curr_change, delta = change_and_delta(total_samples_old_old, total_samples_old, total_samples)
|
53 |
+
|
54 |
+
col1, col2 = st.columns(2)
|
55 |
+
with col1:
|
56 |
+
st.metric(label="Total models", value=total_samples, delta=total_samples-total_samples_old)
|
57 |
+
|
58 |
+
with col2:
|
59 |
+
st.metric(label="Rate of change", value=curr_change, delta=delta)
|
60 |
|
61 |
# Tabs don't work in Spaces st version
|
62 |
#tab1, tab2, tab3, tab4, tab5, tab6, tab7, tab8 = st.tabs(["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super users", "Raw Data"])
|
|
|
65 |
'Topic of interest',
|
66 |
["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
|
67 |
|
|
|
68 |
if tab == "Language":
|
69 |
st.header("Languages info")
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
def make_list(row):
|
72 |
languages = row["languages"]
|
73 |
if languages == "none":
|
|
|
75 |
return literal_eval(languages)
|
76 |
|
77 |
def language_count(row):
|
78 |
+
return len(row["languages"])
|
|
|
|
|
79 |
|
80 |
+
def process_for_lang(data):
|
81 |
+
# Remove rows without languages
|
82 |
+
data.loc[data.languages == "False", 'languages'] = None
|
83 |
+
data.loc[data.languages == {}, 'languages'] = None
|
84 |
|
85 |
+
# Count of rows that have no languages
|
86 |
+
no_lang_count = data["languages"].isna().sum()
|
|
|
|
|
87 |
|
88 |
+
# As the languages column might have multiple languages,
|
89 |
+
# we need to convert it to a list. We then count the number of languages.
|
90 |
+
data["languages"] = data["languages"].fillna('none')
|
91 |
+
data["languages"] = data.apply(make_list, axis=1)
|
92 |
+
data["language_count"] = data.apply(language_count, axis=1)
|
93 |
|
94 |
+
# Just keep the models with at least one language
|
95 |
+
models_with_langs = data[data["language_count"] > 0]
|
96 |
+
langs = models_with_langs["languages"].explode()
|
97 |
+
langs = langs[langs != {}]
|
98 |
+
total_langs = len(langs.unique())
|
99 |
+
|
100 |
+
data['multilingual'] = data.apply(lambda x: int("multilingual" in x['languages']), axis=1)
|
101 |
+
|
102 |
+
return data, no_lang_count, total_langs, langs.unique()
|
103 |
+
|
104 |
+
filtered_data = data.copy()
|
105 |
+
old_filtered_data = old_data.copy()
|
106 |
+
old_old_filtered_data = old_old_data.copy()
|
107 |
+
|
108 |
+
modality = st.selectbox(
|
109 |
+
'Modalities',
|
110 |
+
["All", "NLP", "Audio", "Multimodal"])
|
111 |
+
|
112 |
+
if modality == "NLP":
|
113 |
+
filtered_data = filtered_data[filtered_data["modality"] == "nlp"]
|
114 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "nlp"]
|
115 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "nlp"]
|
116 |
+
elif modality == "Audio":
|
117 |
+
filtered_data = filtered_data[filtered_data["modality"] == "audio"]
|
118 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "audio"]
|
119 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "audio"]
|
120 |
+
elif modality == "Multimodal":
|
121 |
+
filtered_data = filtered_data[filtered_data["modality"] == "multimodal"]
|
122 |
+
old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "multimodal"]
|
123 |
+
old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "multimodal"]
|
124 |
+
|
125 |
+
|
126 |
+
filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data)
|
127 |
+
old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data)
|
128 |
+
old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data)
|
129 |
+
|
130 |
+
total_samples_filtered = filtered_data.shape[0]
|
131 |
+
total_samples_old_filtered = old_filtered_data.shape[0]
|
132 |
+
total_samples_old_old_filtered = old_old_filtered_data.shape[0]
|
133 |
+
v = total_samples_filtered-no_lang_count
|
134 |
+
v_old = total_samples_old_filtered-no_lang_count_old
|
135 |
+
v_old_old = total_samples_old_old_filtered-no_lang_count_old_old
|
136 |
+
|
137 |
+
col1, col2 = st.columns(2)
|
138 |
with col1:
|
|
|
|
|
139 |
st.metric(label="Language Specified", value=v, delta=int(v-v_old))
|
140 |
with col2:
|
141 |
+
curr_change, delta = change_and_delta(v_old_old, v_old, v)
|
142 |
+
st.metric(label="Language Specified Rate of Change", value=curr_change, delta=delta)
|
143 |
+
|
144 |
+
col1, col2 = st.columns(2)
|
145 |
+
with col1:
|
146 |
st.metric(label="No Language Specified", value=no_lang_count, delta=int(no_lang_count-no_lang_count_old))
|
147 |
+
with col2:
|
148 |
+
curr_change, delta = change_and_delta(no_lang_count_old_old, no_lang_count_old, no_lang_count)
|
149 |
+
st.metric(label="No Language Specified Rate of Change", value=curr_change, delta=delta)
|
150 |
+
|
151 |
+
col1, col2 = st.columns(2)
|
152 |
+
with col1:
|
153 |
st.metric(label="Total Unique Languages", value=total_langs, delta=int(total_langs-total_langs_old))
|
154 |
+
with col2:
|
155 |
+
curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
|
156 |
+
st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
|
157 |
+
st.text(f"New languages {set(langs)-set(langs_old)}")
|
158 |
|
159 |
st.subheader("Count of languages per model repo")
|
160 |
st.text("Some repos are for multiple languages, so the count is greater than 1")
|
|
|
162 |
'All or just Multilingual',
|
163 |
["All", "Just Multilingual", "Three or more languages"])
|
164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
|
166 |
+
def filter_multilinguality(data):
|
167 |
+
if linguality == "Just Multilingual":
|
168 |
+
multilingual_tag = data["multilingual"] == 1
|
169 |
+
multiple_lang_tags = data["language_count"] > 1
|
170 |
+
return data[multilingual_tag | multiple_lang_tags]
|
171 |
+
elif linguality == "Three or more languages":
|
172 |
+
return data[data["language_count"] >= 3]
|
173 |
+
else:
|
174 |
+
return data
|
175 |
+
|
176 |
+
models_with_langs = filter_multilinguality(filtered_data)
|
177 |
+
models_with_langs_old = filter_multilinguality(old_filtered_data)
|
178 |
+
|
179 |
df1 = models_with_langs['language_count'].value_counts()
|
|
|
180 |
df1_old = models_with_langs_old['language_count'].value_counts()
|
181 |
st.bar_chart(df1)
|
182 |
|
|
|
193 |
else:
|
194 |
filter = 2
|
195 |
|
196 |
+
models_with_langs = filtered_data[filtered_data["language_count"] > 0]
|
197 |
langs = models_with_langs["languages"].explode()
|
198 |
langs = langs[langs != {}]
|
199 |
orig_d = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
200 |
d = orig_d
|
201 |
|
202 |
+
models_with_langs_old = old_filtered_data[old_filtered_data["language_count"] > 0]
|
203 |
langs = models_with_langs_old["languages"].explode()
|
204 |
langs = langs[langs != {}]
|
205 |
orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
|
|
|
231 |
final_data = pd.merge(
|
232 |
d, orig_d_old, how="outer", on="language"
|
233 |
)
|
234 |
+
print(final_data["counts"].isna().sum())
|
235 |
+
print(final_data["old_c"].isna().sum())
|
236 |
final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
|
237 |
|
238 |
st.dataframe(final_data)
|
utils.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
import streamlit as st
|
3 |
+
from ast import literal_eval
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
|
7 |
+
# Pipeline tags grouped by the modality they are reported under.
nlp_tasks = [
    "text-classification",
    "text-generation",
    "text2text-generation",
    "token-classification",
    "fill-mask",
    "question-answering",
    "translation",
    "conversational",
    "sentence-similarity",
    "summarization",
    "multiple-choice",
    "zero-shot-classification",
    "table-question-answering",
]
audio_tasks = [
    "automatic-speech-recognition",
    "audio-classification",
    "text-to-speech",
    "audio-to-audio",
    "voice-activity-detection",
]
cv_tasks = [
    "image-classification",
    "image-segmentation",
    "zero-shot-image-classification",
    "image-to-image",
    "unconditional-image-generation",
    "object-detection",
]
multimodal = [
    "feature-extraction",
    "text-to-image",
    "visual-question-answering",
    "image-to-text",
    "document-question-answering",
]
tabular = [
    "tabular-classification",
    "tabular-regression",
]

# Modality label -> list of pipeline tags that belong to it.
modalities = dict(
    nlp=nlp_tasks,
    audio=audio_tasks,
    cv=cv_tasks,
    multimodal=multimodal,
    tabular=tabular,
    rl=["reinforcement-learning"],
)
|
23 |
+
|
24 |
+
def modality(row):
    """Return the modality label for a row, based on its ``pipeline`` value.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``"pipeline"``
            entry.

    Returns:
        The key of ``modalities`` whose task list contains the pipeline,
        ``"unk_modality"`` when the pipeline is a string that is in none of
        the lists, or ``None`` when the pipeline is not a string.
    """
    pipeline = row["pipeline"]
    for label, tasks in modalities.items():
        if pipeline in tasks:
            return label
    # The previous check, type(pipeline) == "str", compared a type object to a
    # string literal and was therefore always False, so unrecognised pipeline
    # strings fell through to None instead of "unk_modality".
    if isinstance(pipeline, str):
        return "unk_modality"
    return None
|
32 |
+
|
33 |
+
@st.cache(allow_output_mutation=True)
def _load_processed_dataset(version):
    """Load and pre-process the stats dataset for one revision (cached).

    The cached frame is shared between reruns, so it must never be handed to
    callers directly; ``process_dataset`` returns copies of it.
    """
    # Load dataset at specified revision
    dataset = load_dataset("open-source-metrics/model-repos-stats", revision=version)

    # Convert to pandas dataframe
    data = dataset["train"].to_pandas()

    # Add modality column
    data["modality"] = data.apply(modality, axis=1)

    # Bin the model card length into some bins
    data["length_bins"] = pd.cut(
        data["text_length"],
        [0, 200, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 20000, 50000],
    )

    return data


def process_dataset(version):
    """Return the pre-processed model-repo stats for ``version`` as a DataFrame.

    The frame has the dataset's ``train`` split columns plus ``modality``
    (from ``modality``) and ``length_bins`` (binned ``text_length``).

    The original code called ``st.cache(...)`` as a bare statement, which
    builds a decorator and discards it, so nothing was cached. Caching is now
    applied to a private loader, and a copy is returned on every call so
    callers that overwrite columns in place (such as re-parsing ``tags``)
    cannot corrupt the cached frame for later reruns.

    Args:
        version: dataset revision passed to ``load_dataset``.
    """
    return _load_processed_dataset(version).copy()
|
48 |
+
|
49 |
+
def eval_tags(row):
    """Return the ``tags`` entry of ``row`` as a Python list.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``"tags"``
            entry.

    Returns:
        * ``[]`` for the placeholders ``"none"``, ``"{}"``, an empty string,
          ``None``, a dict, or any non-iterable value;
        * the parsed list when the string starts with ``[``;
        * ``[tags]`` for any other string (a single bare tag);
        * ``list(tags)`` when the value is already a non-string iterable,
          which makes the function safe to apply twice.

    Raises:
        ValueError, SyntaxError: from ``literal_eval`` when a string starting
            with ``[`` is not a valid Python literal.
    """
    tags = row["tags"]

    if isinstance(tags, str):
        if tags in ("", "none", "{}"):
            return []
        if not tags.startswith("["):
            # A bare tag: wrap it directly rather than round-tripping through
            # str()/literal_eval, which yields the same one-element list.
            return [tags]
        parsed = literal_eval(tags)
        return parsed if isinstance(parsed, list) else []

    if tags is None or isinstance(tags, dict):
        return []
    try:
        # Already-parsed values (list, tuple, array, ...) pass through.
        return list(tags)
    except TypeError:
        # Non-iterable values such as NaN carry no tags.
        return []
|
59 |
+
|
60 |
+
def change_pct(old, new):
    """Return the change from ``old`` to ``new`` as a percentage of ``new``.

    The result is rounded to three decimal places.

    NOTE(review): the denominator is ``new`` rather than the more common
    ``old``; this is kept as-is so displayed figures do not shift -- confirm
    that this is the intended definition.

    When ``new`` is 0 the ratio is undefined; 0.0 is returned so callers that
    format the value for display do not hit ZeroDivisionError.
    """
    if new == 0:
        return 0.0
    return round(100 * (new - old) / new, 3)
|
62 |
+
|
63 |
+
def change_and_delta(old_old, old, new):
    """Return the latest rate of change and its shift versus the prior one.

    Args:
        old_old: value at the oldest revision.
        old: value at the middle revision.
        new: value at the latest revision.

    Returns:
        A ``(curr_change, delta)`` tuple of strings ending in ``%``. The first
        is ``change_pct(old, new)``; the second is that value minus
        ``change_pct(old_old, old)``, rounded to three decimal places.
    """
    latest = change_pct(old, new)
    previous = change_pct(old_old, old)
    shift = round(latest - previous, 3)
    return f"{latest}%", f"{shift}%"
|