osanseviero committed on
Commit
78f7e42
·
1 Parent(s): 6c21ae3

Release v2

Browse files
__pycache__/language.cpython-38.pyc ADDED
Binary file (1.6 kB). View file
 
__pycache__/pipelines.cpython-38.pyc ADDED
Binary file (1.5 kB). View file
 
__pycache__/utils.cpython-38.pyc ADDED
Binary file (2.59 kB). View file
 
changelog.md CHANGED
@@ -1,11 +1,26 @@
1
  Changelog
2
 
 
 
 
3
  v0.2 - Oct 24
4
  - Languages
5
  - Allow filtering for modality
6
- - Show new languages for the diff
7
  - Show rate of change in languages
8
  - Also include multilingual tag as multilingual for model selection in languages
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  v0.1
11
  - Allow pick comparison version
 
1
  Changelog
2
 
3
+ Planned
4
+ - Allow filtering just for the new models (no way to get this atm)
5
+
6
  v0.2 - Oct 24
7
  - Languages
8
  - Allow filtering for modality
9
+ - Show new and removed languages for the diff
10
  - Show rate of change in languages
11
  - Also include multilingual tag as multilingual for model selection in languages
12
+ - Spotted bug: False as a row in the dataset. To look into it
13
+ - License
14
+ - Add rate of change for top metrics
15
+ - Show lost and new licenses
16
+ - Pipelines
17
+ - Add rate of change for all metrics
18
+ - Fix bug that did not show new tags
19
+ - Add info per modality
20
+ - See new tags
21
+ - Pipeline breakdown by modality
22
+ - Discussions and Libraries
23
+ - Add rate of change for metrics
24
 
25
  v0.1
26
  - Allow pick comparison version
language.py CHANGED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ast import literal_eval
2
+
3
def make_lang_list(row):
    """Return the languages of a dataset row as a list.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing a
            ``"languages"`` entry. The entry is normally the string form of
            a Python literal such as ``"['en', 'fr']"``, or the sentinel
            ``"none"`` used for rows without any language.

    Returns:
        The parsed languages. The ``"none"`` sentinel, missing values and
        non-collection literals (e.g. ``"False"``) yield ``[]``; a single
        bare string is wrapped in a one-element list; values that are
        already parsed are returned as they are.

    Raises:
        ValueError, SyntaxError: propagated from ``literal_eval`` when the
            string is not a valid Python literal.
    """
    languages = row["languages"]

    if isinstance(languages, str):
        if languages == "none":
            return []
        languages = literal_eval(languages)

    # Missing values and scalar literals (None, NaN, False, numbers) carry no
    # language information and would break len()/explode() downstream.
    if languages is None or isinstance(languages, (bool, int, float)):
        return []

    # A lone language must count as one entry, not as len(str) characters.
    if isinstance(languages, str):
        return [languages]

    return languages
8
+
9
def language_count(row):
    """Return how many entries the row's ``"languages"`` value holds."""
    langs = row["languages"]
    return len(langs)
11
+
12
def process_for_lang(data, modality):
    """Prepare a model dataframe for the language statistics.

    Args:
        data: DataFrame with at least the ``modality`` and ``languages``
            columns. ``languages`` holds the string form of a Python
            literal, or a missing value.
        modality: ``"NLP"``, ``"Audio"`` or ``"Multimodal"`` to keep only
            that modality; any other value keeps every row.

    Returns:
        A tuple ``(data, no_lang_count, total_langs, unique_langs)`` where
        ``data`` is a processed copy with ``languages`` parsed into lists and
        the added ``language_count`` and ``multilingual`` columns,
        ``no_lang_count`` is the number of rows without languages,
        ``total_langs`` is the number of distinct languages and
        ``unique_langs`` is the array of those distinct languages.
    """
    # Filter by modality
    modality_values = {"NLP": "nlp", "Audio": "audio", "Multimodal": "multimodal"}
    if modality in modality_values:
        data = data[data["modality"] == modality_values[modality]]

    # Work on an independent copy: the assignments below would otherwise
    # write into a slice of (or directly into) the caller's frame.
    data = data.copy()

    # Remove rows without languages
    data.loc[data["languages"] == "False", "languages"] = None
    data.loc[data["languages"] == {}, "languages"] = None

    # Count of rows that have no languages
    no_lang_count = data["languages"].isna().sum()

    # As the languages column might have multiple languages,
    # we need to convert it to a list. We then count the number of languages.
    # Column-wise apply always produces a Series, which keeps the column
    # assignments valid even if the modality filter left no rows.
    data["languages"] = data["languages"].fillna("none")
    data["languages"] = data["languages"].apply(
        # make_lang_list expects a row-like mapping with a "languages" entry.
        lambda value: make_lang_list({"languages": value})
    )
    data["language_count"] = data["languages"].apply(len)

    # Just keep the models with at least one language
    models_with_langs = data[data["language_count"] > 0]
    langs = models_with_langs["languages"].explode()
    langs = langs[langs != {}]
    unique_langs = langs.unique()
    total_langs = len(unique_langs)

    # 1 when the repo carries the explicit "multilingual" tag, else 0.
    data["multilingual"] = data["languages"].apply(
        lambda row_langs: int("multilingual" in row_langs)
    )

    return data, no_lang_count, total_langs, unique_langs
43
+
44
def filter_multilinguality(data, linguality):
    """Restrict the frame to the requested multilinguality bucket.

    Args:
        data: DataFrame with ``multilingual`` and ``language_count`` columns.
        linguality: ``"Just Multilingual"`` keeps rows flagged multilingual
            or listing more than one language; ``"Three or more languages"``
            keeps rows with at least three languages; anything else returns
            ``data`` unchanged.

    Returns:
        The filtered DataFrame, or ``data`` itself when no filter applies.
    """
    if linguality == "Three or more languages":
        return data[data["language_count"] >= 3]

    if linguality != "Just Multilingual":
        # No recognised filter requested: hand the frame back untouched.
        return data

    has_multilingual_tag = data["multilingual"] == 1
    lists_several_langs = data["language_count"] > 1
    return data[has_multilingual_tag | lists_several_langs]
models.py CHANGED
@@ -4,7 +4,9 @@ from ast import literal_eval
4
  import altair as alt
5
  import matplotlib.pyplot as plt
6
 
7
- from utils import process_dataset, eval_tags
 
 
8
 
9
  def main():
10
  # Pick revision at top
@@ -26,16 +28,6 @@ def main():
26
  supported_revisions,
27
  index=2)
28
 
29
- def change_pct(old, new):
30
- return round(100* (new - old) / new, 3)
31
-
32
- def change_and_delta(old_old, old, new):
33
- curr_change = change_pct(old, new)
34
- prev_change = change_pct(old_old, old)
35
- delta = f"{curr_change-prev_change}%"
36
- curr_change = f"{curr_change}%"
37
- return curr_change, delta
38
-
39
  # Process dataset
40
  old_old_data = process_dataset(base_old)
41
  old_data = process_dataset(base)
@@ -63,44 +55,11 @@ def main():
63
 
64
  tab = st.selectbox(
65
  'Topic of interest',
66
- ["Language", "License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
67
 
68
  if tab == "Language":
69
  st.header("Languages info")
70
 
71
- def make_list(row):
72
- languages = row["languages"]
73
- if languages == "none":
74
- return []
75
- return literal_eval(languages)
76
-
77
- def language_count(row):
78
- return len(row["languages"])
79
-
80
- def process_for_lang(data):
81
- # Remove rows without languages
82
- data.loc[data.languages == "False", 'languages'] = None
83
- data.loc[data.languages == {}, 'languages'] = None
84
-
85
- # Count of rows that have no languages
86
- no_lang_count = data["languages"].isna().sum()
87
-
88
- # As the languages column might have multiple languages,
89
- # we need to convert it to a list. We then count the number of languages.
90
- data["languages"] = data["languages"].fillna('none')
91
- data["languages"] = data.apply(make_list, axis=1)
92
- data["language_count"] = data.apply(language_count, axis=1)
93
-
94
- # Just keep the models with at least one language
95
- models_with_langs = data[data["language_count"] > 0]
96
- langs = models_with_langs["languages"].explode()
97
- langs = langs[langs != {}]
98
- total_langs = len(langs.unique())
99
-
100
- data['multilingual'] = data.apply(lambda x: int("multilingual" in x['languages']), axis=1)
101
-
102
- return data, no_lang_count, total_langs, langs.unique()
103
-
104
  filtered_data = data.copy()
105
  old_filtered_data = old_data.copy()
106
  old_old_filtered_data = old_old_data.copy()
@@ -109,30 +68,13 @@ def main():
109
  'Modalities',
110
  ["All", "NLP", "Audio", "Multimodal"])
111
 
112
- if modality == "NLP":
113
- filtered_data = filtered_data[filtered_data["modality"] == "nlp"]
114
- old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "nlp"]
115
- old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "nlp"]
116
- elif modality == "Audio":
117
- filtered_data = filtered_data[filtered_data["modality"] == "audio"]
118
- old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "audio"]
119
- old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "audio"]
120
- elif modality == "Multimodal":
121
- filtered_data = filtered_data[filtered_data["modality"] == "multimodal"]
122
- old_filtered_data = old_filtered_data[old_filtered_data["modality"] == "multimodal"]
123
- old_old_filtered_data = old_old_filtered_data[old_old_filtered_data["modality"] == "multimodal"]
124
-
125
-
126
- filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data)
127
- old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data)
128
- old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data)
129
-
130
- total_samples_filtered = filtered_data.shape[0]
131
- total_samples_old_filtered = old_filtered_data.shape[0]
132
- total_samples_old_old_filtered = old_old_filtered_data.shape[0]
133
- v = total_samples_filtered-no_lang_count
134
- v_old = total_samples_old_filtered-no_lang_count_old
135
- v_old_old = total_samples_old_old_filtered-no_lang_count_old_old
136
 
137
  col1, col2 = st.columns(2)
138
  with col1:
@@ -155,6 +97,7 @@ def main():
155
  curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
156
  st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
157
  st.text(f"New languages {set(langs)-set(langs_old)}")
 
158
 
159
  st.subheader("Count of languages per model repo")
160
  st.text("Some repos are for multiple languages, so the count is greater than 1")
@@ -162,19 +105,8 @@ def main():
162
  'All or just Multilingual',
163
  ["All", "Just Multilingual", "Three or more languages"])
164
 
165
-
166
- def filter_multilinguality(data):
167
- if linguality == "Just Multilingual":
168
- multilingual_tag = data["multilingual"] == 1
169
- multiple_lang_tags = data["language_count"] > 1
170
- return data[multilingual_tag | multiple_lang_tags]
171
- elif linguality == "Three or more languages":
172
- return data[data["language_count"] >= 3]
173
- else:
174
- return data
175
-
176
- models_with_langs = filter_multilinguality(filtered_data)
177
- models_with_langs_old = filter_multilinguality(old_filtered_data)
178
 
179
  df1 = models_with_langs['language_count'].value_counts()
180
  df1_old = models_with_langs_old['language_count'].value_counts()
@@ -185,14 +117,6 @@ def main():
185
  'All or filtered',
186
  ["All", "No English", "Remove top 10"])
187
 
188
- filter = 0
189
- if linguality_2 == "All":
190
- filter = 0
191
- elif linguality_2 == "No English":
192
- filter = 1
193
- else:
194
- filter = 2
195
-
196
  models_with_langs = filtered_data[filtered_data["language_count"] > 0]
197
  langs = models_with_langs["languages"].explode()
198
  langs = langs[langs != {}]
@@ -204,9 +128,9 @@ def main():
204
  langs = langs[langs != {}]
205
  orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
206
 
207
- if filter == 1:
208
  d = orig_d.iloc[1:]
209
- elif filter == 2:
210
  d = orig_d.iloc[10:]
211
 
212
  # Just keep top 25 to avoid vertical scroll
@@ -231,31 +155,51 @@ def main():
231
  final_data = pd.merge(
232
  d, orig_d_old, how="outer", on="language"
233
  )
234
- print(final_data["counts"].isna().sum())
235
- print(final_data["old_c"].isna().sum())
236
- final_data["diff"] = final_data["counts"].astype(int) - final_data["old_c"].astype(int)
237
-
238
  st.dataframe(final_data)
239
 
240
-
241
-
242
  #with tab2:
243
  if tab == "License":
244
  st.header("License info")
245
 
246
  no_license_count = data["license"].isna().sum()
247
  no_license_count_old = old_data["license"].isna().sum()
248
- col1, col2, col3 = st.columns(3)
 
 
 
249
  with col1:
250
  v = total_samples-no_license_count
251
  v_old = total_samples_old-no_license_count_old
252
  st.metric(label="License Specified", value=v, delta=int(v-v_old))
253
  with col2:
254
- st.metric(label="No license Specified", value=no_license_count, delta=int(no_license_count-no_license_count_old))
255
- with col3:
256
- unique_licenses = len(data["license"].unique())
257
- unique_licenses_old = len(old_data["license"].unique())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  st.metric(label="Total Unique Licenses", value=unique_licenses, delta=int(unique_licenses-unique_licenses_old))
 
 
 
 
 
259
 
260
  st.subheader("Distribution of licenses per model repo")
261
  license_filter = st.selectbox(
@@ -306,81 +250,65 @@ def main():
306
 
307
  tags_old = old_data["tags"].explode()
308
  tags_old = tags_old[tags_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
309
- s = tags_old["tag"]
310
- s = s[s.apply(type) == str]
311
- unique_tags_old = len(s.unique())
 
 
 
 
 
 
312
 
313
  no_pipeline_count = data["pipeline"].isna().sum()
314
  no_pipeline_count_old = old_data["pipeline"].isna().sum()
 
315
 
316
- col1, col2, col3 = st.columns(3)
 
 
 
317
  with col1:
318
- v = total_samples-no_pipeline_count
319
- v_old = total_samples_old-no_pipeline_count_old
320
  st.metric(label="# models that have any pipeline", value=v, delta=int(v-v_old))
321
  with col2:
 
 
 
 
 
322
  st.metric(label="No pipeline Specified", value=no_pipeline_count, delta=int(no_pipeline_count-no_pipeline_count_old))
323
- with col3:
 
 
 
 
 
324
  st.metric(label="Total Unique Tags", value=unique_tags, delta=int(unique_tags-unique_tags_old))
 
 
 
325
 
326
- pipeline_filter = st.selectbox(
327
  'Modalities',
328
  ["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
329
 
330
- filter = 0
331
- if pipeline_filter == "All":
332
- filter = 0
333
- elif pipeline_filter == "NLP":
334
- filter = 1
335
- elif pipeline_filter == "CV":
336
- filter = 2
337
- elif pipeline_filter == "Audio":
338
- filter = 3
339
- elif pipeline_filter == "RL":
340
- filter = 4
341
- elif pipeline_filter == "Multimodal":
342
- filter = 5
343
- elif pipeline_filter == "Tabular":
344
- filter = 6
345
-
346
  st.subheader("High-level metrics")
347
- filtered_data = data[data['pipeline'].notna()]
348
- filtered_data_old = old_data[old_data['pipeline'].notna()]
349
-
350
- if filter == 1:
351
- filtered_data = data[data["modality"] == "nlp"]
352
- filtered_data_old = old_data[old_data["modality"] == "nlp"]
353
- elif filter == 2:
354
- filtered_data = data[data["modality"] == "cv"]
355
- filtered_data_old = old_data[old_data["modality"] == "cv"]
356
- elif filter == 3:
357
- filtered_data = data[data["modality"] == "audio"]
358
- filtered_data_old = old_data[old_data["modality"] == "audio"]
359
- elif filter == 4:
360
- filtered_data = data[data["modality"] == "rl"]
361
- filtered_data_old = old_data[old_data["modality"] == "rl"]
362
- elif filter == 5:
363
- filtered_data = data[data["modality"] == "multimodal"]
364
- filtered_data_old = old_data[old_data["modality"] == "multimodal"]
365
- elif filter == 6:
366
- filtered_data = data[data["modality"] == "tabular"]
367
- filtered_data_old = old_data[old_data["modality"] == "tabular"]
368
 
369
  col1, col2, col3 = st.columns(3)
370
  with col1:
371
  p = st.selectbox(
372
  'What pipeline do you want to see?',
373
- ["all", *filtered_data["pipeline"].unique()]
374
  )
375
  with col2:
376
  l = st.selectbox(
377
  'What library do you want to see?',
378
- ["all", "not transformers", *filtered_data["library"].unique()]
379
  )
380
  with col3:
381
  f = st.selectbox(
382
- 'What framework support? (transformers)',
383
- ["all", "py", "tf", "jax"]
384
  )
385
 
386
  col1, col2 = st.columns(2)
@@ -393,49 +321,13 @@ def main():
393
  o = st.selectbox(
394
  label="Operation (for tags)",
395
  options=["Any", "All", "None"]
396
- )
397
-
398
- def filter_fn(row):
399
- tags = row["tags"]
400
- tags[:] = [d for d in tags if isinstance(d, str)]
401
- if o == "All":
402
- if all(elem in tags for elem in filt):
403
- return True
404
-
405
- s1 = set(tags)
406
- s2 = set(filt)
407
- if o == "Any":
408
- if bool(s1 & s2):
409
- return True
410
- if o == "None":
411
- if len(s1.intersection(s2)) == 0:
412
- return True
413
- return False
414
 
 
 
 
 
415
 
416
- if p != "all":
417
- filtered_data = filtered_data[filtered_data["pipeline"] == p]
418
- filtered_data_old = filtered_data_old[filtered_data_old["pipeline"] == p]
419
- if l != "all" and l != "not transformers":
420
- filtered_data = filtered_data[filtered_data["library"] == l]
421
- filtered_data_old = filtered_data_old[filtered_data_old["library"] == l]
422
- if l == "not transformers":
423
- filtered_data = filtered_data[filtered_data["library"] != "transformers"]
424
- filtered_data_old = filtered_data_old[filtered_data_old["library"] != "transformers"]
425
- if f != "all":
426
- if f == "py":
427
- filtered_data = filtered_data[filtered_data["pytorch"] == 1]
428
- filtered_data_old = filtered_data_old[filtered_data_old["pytorch"] == 1]
429
- elif f == "tf":
430
- filtered_data = filtered_data[filtered_data["tensorflow"] == 1]
431
- filtered_data_old = filtered_data_old[filtered_data_old["tensorflow"] == 1]
432
- elif f == "jax":
433
- filtered_data = filtered_data[filtered_data["jax"] == 1]
434
- filtered_data_old = filtered_data_old[filtered_data_old["jax"] == 1]
435
- if filt != []:
436
- filtered_data = filtered_data[filtered_data.apply(filter_fn, axis=1)]
437
- filtered_data_old = filtered_data_old[filtered_data_old.apply(filter_fn, axis=1)]
438
-
439
 
440
  d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
441
  columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
@@ -443,23 +335,45 @@ def main():
443
  final_data = pd.merge(
444
  d, grouped_data, how="outer", on="pipeline"
445
  )
446
- sums = grouped_data.sum()
447
 
448
  d_old = filtered_data_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
449
  grouped_data_old = filtered_data_old.groupby("pipeline").sum()[columns_of_interest]
450
  final_data_old = pd.merge(
451
  d_old, grouped_data_old, how="outer", on="pipeline"
452
  )
 
 
 
 
453
  sums = grouped_data.sum()
454
  sums_old = grouped_data_old.sum()
 
455
 
456
- col1, col2, col3 = st.columns(3)
 
 
 
457
  with col1:
458
- st.metric(label="Total models", value=filtered_data.shape[0], delta=int(filtered_data.shape[0] - filtered_data_old.shape[0]))
459
  with col2:
460
- st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"], delta=int(sums["downloads_30d"] - sums_old["downloads_30d"]))
 
461
  with col3:
 
 
 
 
 
 
 
 
 
 
462
  st.metric(label="Cumulative likes", value=sums["likes"], delta=int(sums["likes"] - sums_old["likes"]))
 
 
 
 
463
 
464
  col1, col2, col3 = st.columns(3)
465
  with col1:
@@ -469,9 +383,41 @@ def main():
469
  with col3:
470
  st.metric(label="Total in JAX", value=sums["jax"], delta=int(sums["jax"] - sums_old["jax"]))
471
 
472
- st.metric(label="Unique Tags", value=unique_tags, delta=int(unique_tags - unique_tags_old))
 
 
 
 
473
 
474
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
 
476
  st.subheader("Count of models per pipeline")
477
  st.write(alt.Chart(d).mark_bar().encode(
@@ -511,8 +457,6 @@ def main():
511
  "downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
512
  raw_data = filtered_data[columns_of_interest]
513
  st.dataframe(raw_data)
514
-
515
-
516
 
517
  # todo : add activity metric
518
 
@@ -524,6 +468,7 @@ def main():
524
  columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
525
  sums = data[columns_of_interest].sum()
526
  sums_old = old_data[columns_of_interest].sum()
 
527
 
528
  col1, col2, col3, col4 = st.columns(4)
529
  with col1:
@@ -535,6 +480,20 @@ def main():
535
  with col4:
536
  st.metric(label="PRs closed", value=sums["prs_closed"], delta=int(sums["prs_closed"] - sums_old["prs_closed"]))
537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
  col1, col2, col3 = st.columns(3)
539
  with col1:
540
  st.metric(label="Total discussions", value=sums["discussions_count"], delta=int(sums["discussions_count"] - sums_old["discussions_count"]))
@@ -543,6 +502,17 @@ def main():
543
  with col3:
544
  st.metric(label="Discussions closed", value=sums["discussions_closed"], delta=int(sums["discussions_closed"] - sums_old["discussions_closed"]))
545
 
 
 
 
 
 
 
 
 
 
 
 
546
  filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
547
  st.dataframe(filtered_data)
548
 
@@ -552,6 +522,7 @@ def main():
552
 
553
  no_library_count = data["library"].isna().sum()
554
  no_library_count_old = old_data["library"].isna().sum()
 
555
  col1, col2, col3 = st.columns(3)
556
  with col1:
557
  v = total_samples-no_library_count
@@ -564,6 +535,22 @@ def main():
564
  v_old = len(old_data["library"].unique())
565
  st.metric(label="Total Unique library", value=v, delta=int(v-v_old))
566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
 
568
  st.subheader("High-level metrics")
569
  filtered_data = data[data['library'].notna()]
@@ -623,8 +610,6 @@ def main():
623
  y=alt.X('library', sort=None)
624
  ))
625
 
626
-
627
-
628
  st.subheader("Aggregated Data")
629
  final_data = pd.merge(
630
  final_data, final_data_old, how="outer", on="library"
@@ -647,6 +632,7 @@ def main():
647
  columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
648
  rows = data.shape[0]
649
  rows_old = old_data.shape[0]
 
650
 
651
  cond = data["has_model_index"] | data["has_text"]
652
  with_model_card = data[cond]
@@ -656,31 +642,58 @@ def main():
656
  with_model_card_old = old_data[cond]
657
  c_model_card_old = with_model_card_old.shape[0]
658
 
 
 
 
 
659
  st.subheader("High-level metrics")
660
- col1, col2, col3 = st.columns(3)
661
  with col1:
662
- st.metric(label="# models with model card file", value=c_model_card, delta=int(c_model_card-c_model_card_old))
663
  with col2:
664
- st.metric(label="# models without model card file", value=rows-c_model_card, delta=int((rows-c_model_card)-(rows_old-c_model_card_old)))
665
-
 
 
 
 
 
 
666
  with_index = data["has_model_index"].sum()
667
  with_index_old = old_data["has_model_index"].sum()
 
668
  with col1:
669
- st.metric(label="# models with model index", value=with_index, delta=int(with_index-with_index_old))
670
  with col2:
671
- st.metric(label="# models without model index", value=rows-with_index, delta=int((rows-with_index)-(rows_old-with_index_old)))
 
 
 
 
 
 
672
 
673
  with_text = data["has_text"]
674
  with_text_old = old_data["has_text"]
 
 
 
 
 
675
  with col1:
676
- st.metric(label="# models with model card text", value=with_text.sum(), delta=int(with_text.sum()-with_text_old.sum()))
677
  with col2:
678
- st.metric(label="# models without model card text", value=rows-with_text.sum(), delta=int((rows-with_text.sum())-(rows_old-with_text_old.sum())))
 
 
 
 
 
 
679
 
680
-
681
  st.subheader("Length (chars) of model card content")
682
- fig, ax = plt.subplots()
683
- ax = data["length_bins"].value_counts().plot.bar()
684
  st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
685
  st.pyplot(fig)
686
 
 
4
  import altair as alt
5
  import matplotlib.pyplot as plt
6
 
7
+ from utils import process_dataset, eval_tags, change_and_delta
8
+ from language import process_for_lang, filter_multilinguality
9
+ from pipelines import filter_pipeline_data
10
 
11
  def main():
12
  # Pick revision at top
 
28
  supported_revisions,
29
  index=2)
30
 
 
 
 
 
 
 
 
 
 
 
31
  # Process dataset
32
  old_old_data = process_dataset(base_old)
33
  old_data = process_dataset(base)
 
55
 
56
  tab = st.selectbox(
57
  'Topic of interest',
58
+ ["Language","License", "Pipeline", "Discussion Features", "Libraries", "Model Cards", "Super Users", "Raw Data"])
59
 
60
  if tab == "Language":
61
  st.header("Languages info")
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  filtered_data = data.copy()
64
  old_filtered_data = old_data.copy()
65
  old_old_filtered_data = old_old_data.copy()
 
68
  'Modalities',
69
  ["All", "NLP", "Audio", "Multimodal"])
70
 
71
+ filtered_data, no_lang_count, total_langs, langs = process_for_lang(filtered_data, modality)
72
+ old_filtered_data, no_lang_count_old, total_langs_old, langs_old = process_for_lang(old_filtered_data, modality)
73
+ old_old_filtered_data, no_lang_count_old_old, total_langs_old_old, _ = process_for_lang(old_old_filtered_data, modality)
74
+
75
+ v = filtered_data.shape[0]-no_lang_count
76
+ v_old = old_filtered_data.shape[0]-no_lang_count_old
77
+ v_old_old = old_old_filtered_data.shape[0]-no_lang_count_old_old
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  col1, col2 = st.columns(2)
80
  with col1:
 
97
  curr_change, delta = change_and_delta(total_langs_old_old, total_langs_old, total_langs)
98
  st.metric(label="Total Unique Languages Rate of Change", value=curr_change, delta=delta)
99
  st.text(f"New languages {set(langs)-set(langs_old)}")
100
+ st.text(f"Lost languages {set(langs_old)-set(langs)}")
101
 
102
  st.subheader("Count of languages per model repo")
103
  st.text("Some repos are for multiple languages, so the count is greater than 1")
 
105
  'All or just Multilingual',
106
  ["All", "Just Multilingual", "Three or more languages"])
107
 
108
+ models_with_langs = filter_multilinguality(filtered_data, linguality)
109
+ models_with_langs_old = filter_multilinguality(old_filtered_data, linguality)
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  df1 = models_with_langs['language_count'].value_counts()
112
  df1_old = models_with_langs_old['language_count'].value_counts()
 
117
  'All or filtered',
118
  ["All", "No English", "Remove top 10"])
119
 
 
 
 
 
 
 
 
 
120
  models_with_langs = filtered_data[filtered_data["language_count"] > 0]
121
  langs = models_with_langs["languages"].explode()
122
  langs = langs[langs != {}]
 
128
  langs = langs[langs != {}]
129
  orig_d_old = langs.value_counts().rename_axis("language").to_frame('counts').reset_index()
130
 
131
+ if linguality_2 == "No English":
132
  d = orig_d.iloc[1:]
133
+ elif linguality_2 == "Remove top 10":
134
  d = orig_d.iloc[10:]
135
 
136
  # Just keep top 25 to avoid vertical scroll
 
155
  final_data = pd.merge(
156
  d, orig_d_old, how="outer", on="language"
157
  )
158
+ final_data['counts'] = final_data['counts'].fillna(0).astype(int)
159
+ final_data['old_c'] = final_data['old_c'].fillna(0).astype(int)
160
+ final_data["diff"] = final_data["counts"] - final_data["old_c"]
161
+ final_data['language'] = final_data['language'].astype(str)
162
  st.dataframe(final_data)
163
 
 
 
164
  #with tab2:
165
  if tab == "License":
166
  st.header("License info")
167
 
168
  no_license_count = data["license"].isna().sum()
169
  no_license_count_old = old_data["license"].isna().sum()
170
+ no_license_count_old_old = old_old_data["license"].isna().sum()
171
+
172
+
173
+ col1, col2 = st.columns(2)
174
  with col1:
175
  v = total_samples-no_license_count
176
  v_old = total_samples_old-no_license_count_old
177
  st.metric(label="License Specified", value=v, delta=int(v-v_old))
178
  with col2:
179
+ v = total_samples-no_license_count
180
+ v_old = total_samples_old-no_license_count_old
181
+ v_old_old = total_samples_old-no_license_count_old_old
182
+ curr_change, delta = change_and_delta(v_old_old, v_old, v)
183
+ st.metric(label="License Specified Rate of Change", value=curr_change, delta=delta)
184
+
185
+ col1, col2 = st.columns(2)
186
+ with col1:
187
+ st.metric(label="No License Specified", value=no_license_count, delta=int(no_license_count-no_license_count_old))
188
+ with col2:
189
+ curr_change, delta = change_and_delta(no_license_count_old_old, no_license_count_old, no_license_count)
190
+ st.metric(label="No License Specified Rate of Change", value=curr_change, delta=delta)
191
+
192
+ col1, col2 = st.columns(2)
193
+ unique_licenses = len(data["license"].unique())
194
+ unique_licenses_old = len(old_data["license"].unique())
195
+ unique_licenses_old_old = len(old_old_data["license"].unique())
196
+ with col1:
197
  st.metric(label="Total Unique Licenses", value=unique_licenses, delta=int(unique_licenses-unique_licenses_old))
198
+ with col2:
199
+ curr_change, delta = change_and_delta(unique_licenses_old_old, unique_licenses_old, unique_licenses)
200
+ st.metric(label="Total Unique Licenses Rate of Change", value=curr_change, delta=delta)
201
+ st.text(f"New licenses {set(data['license'].unique())-set(old_data['license'].unique())}")
202
+ st.text(f"Old licenses {set(old_data['license'].unique())-set(data['license'].unique())}")
203
 
204
  st.subheader("Distribution of licenses per model repo")
205
  license_filter = st.selectbox(
 
250
 
251
  tags_old = old_data["tags"].explode()
252
  tags_old = tags_old[tags_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
253
+ s_o = tags_old["tag"]
254
+ s_o = s_o[s_o.apply(type) == str]
255
+ unique_tags_old = len(s_o.unique())
256
+
257
+ tags_old_old = old_old_data["tags"].explode()
258
+ tags_old_old = tags_old_old[tags_old_old.notna()].value_counts().rename_axis("tag").to_frame('counts').reset_index()
259
+ s_old_old = tags_old_old["tag"]
260
+ s_old_old = s_old_old[s_old_old.apply(type) == str]
261
+ unique_tags_old_old = len(s_old_old.unique())
262
 
263
  no_pipeline_count = data["pipeline"].isna().sum()
264
  no_pipeline_count_old = old_data["pipeline"].isna().sum()
265
+ no_pipeline_count_old_old = old_old_data["pipeline"].isna().sum()
266
 
267
+ col1, col2 = st.columns(2)
268
+ v = total_samples-no_pipeline_count
269
+ v_old = total_samples_old-no_pipeline_count_old
270
+ v_old_old = total_samples_old_old-no_pipeline_count_old_old
271
  with col1:
 
 
272
  st.metric(label="# models that have any pipeline", value=v, delta=int(v-v_old))
273
  with col2:
274
+ curr_change, delta = change_and_delta(v_old_old, v_old, v)
275
+ st.metric(label="# models rate of change", value=curr_change, delta=delta)
276
+
277
+ col1, col2 = st.columns(2)
278
+ with col1:
279
  st.metric(label="No pipeline Specified", value=no_pipeline_count, delta=int(no_pipeline_count-no_pipeline_count_old))
280
+ with col2:
281
+ curr_change, delta = change_and_delta(no_pipeline_count_old_old, no_pipeline_count_old, no_pipeline_count)
282
+ st.metric(label="No pipeline Specified rate of change", value=curr_change, delta=delta)
283
+
284
+ col1, col2 = st.columns(2)
285
+ with col1:
286
  st.metric(label="Total Unique Tags", value=unique_tags, delta=int(unique_tags-unique_tags_old))
287
+ with col2:
288
+ curr_change, delta = change_and_delta(unique_tags_old_old, unique_tags_old, unique_tags)
289
+ st.metric(label="Total Unique Tags", value=curr_change, delta=delta)
290
 
291
+ modality_filter = st.selectbox(
292
  'Modalities',
293
  ["All", "NLP", "CV", "Audio", "RL", "Multimodal", "Tabular"])
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  st.subheader("High-level metrics")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  col1, col2, col3 = st.columns(3)
298
  with col1:
299
  p = st.selectbox(
300
  'What pipeline do you want to see?',
301
+ ["all", *data["pipeline"].unique()]
302
  )
303
  with col2:
304
  l = st.selectbox(
305
  'What library do you want to see?',
306
+ ["all", "not transformers", *data["library"].unique()]
307
  )
308
  with col3:
309
  f = st.selectbox(
310
+ 'What trf framework support?',
311
+ ["all", "pytorch", "tensorflow", "jax"]
312
  )
313
 
314
  col1, col2 = st.columns(2)
 
321
  o = st.selectbox(
322
  label="Operation (for tags)",
323
  options=["Any", "All", "None"]
324
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
326
+ filtered_data, tags = filter_pipeline_data(data, modality_filter, p, l, f, filt, o)
327
+ filtered_data_old, old_tags = filter_pipeline_data(old_data, modality_filter, p, l, f, filt, o)
328
+ filtered_data_old_old, old_old_tags = filter_pipeline_data(old_old_data, modality_filter, p, l, f, filt, o)
329
+ st.subheader("Pipeline breakdown")
330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
  d = filtered_data["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
333
  columns_of_interest = ["downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
 
335
  final_data = pd.merge(
336
  d, grouped_data, how="outer", on="pipeline"
337
  )
 
338
 
339
  d_old = filtered_data_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
340
  grouped_data_old = filtered_data_old.groupby("pipeline").sum()[columns_of_interest]
341
  final_data_old = pd.merge(
342
  d_old, grouped_data_old, how="outer", on="pipeline"
343
  )
344
+
345
+ d_old = filtered_data_old_old["pipeline"].value_counts().rename_axis("pipeline").to_frame('counts').reset_index()
346
+ grouped_data_old_old = filtered_data_old_old.groupby("pipeline").sum()[columns_of_interest]
347
+
348
  sums = grouped_data.sum()
349
  sums_old = grouped_data_old.sum()
350
+ sums_old_old = grouped_data_old_old.sum()
351
 
352
+ col1, col2, col3, col4 = st.columns(4)
353
+ v = filtered_data.shape[0]
354
+ v_old = filtered_data_old.shape[0]
355
+ v_old_old = filtered_data_old_old.shape[0]
356
  with col1:
357
+ st.metric(label="Total models", value=v, delta=int(v - v_old))
358
  with col2:
359
+ curr_change, delta = change_and_delta(v_old_old, v_old, v)
360
+ st.metric(label="Total models rate of change", value=curr_change, delta=delta)
361
  with col3:
362
+ st.metric(label="Cumulative Downloads (30d)", value=sums["downloads_30d"], delta=int(sums["downloads_30d"] - sums_old["downloads_30d"]))
363
+ with col4:
364
+ print(sums_old_old["downloads_30d"], sums_old["downloads_30d"], sums["downloads_30d"])
365
+ curr_change, delta = change_and_delta(sums_old_old["downloads_30d"], sums_old["downloads_30d"], sums["downloads_30d"])
366
+ st.metric(label="Cumulative Downloads (30d) rate of change", value=curr_change, delta=delta)
367
+
368
+ col1, col2, col3 = st.columns(3)
369
+ with col1:
370
+ st.metric(label="Total unique pipelines", value=len(filtered_data["pipeline"].unique()))
371
+ with col2:
372
  st.metric(label="Cumulative likes", value=sums["likes"], delta=int(sums["likes"] - sums_old["likes"]))
373
+ with col3:
374
+ curr_change, delta = change_and_delta(sums_old_old["likes"], sums_old["likes"], sums["likes"])
375
+ st.metric(label="Cumulative Likes rate of change", value=curr_change, delta=delta)
376
+
377
 
378
  col1, col2, col3 = st.columns(3)
379
  with col1:
 
383
  with col3:
384
  st.metric(label="Total in JAX", value=sums["jax"], delta=int(sums["jax"] - sums_old["jax"]))
385
 
386
+ col1, col2 = st.columns(2)
387
+ with col1:
388
+ st.metric(label="Total unique libraries", value=len(filtered_data["library"].unique()))
389
+ with col2:
390
+ st.metric(label="Total unique modality", value=len(filtered_data["modality"].unique()))
391
 
392
+
393
+ col1, col2 = st.columns(2)
394
+ with col1:
395
+ st.metric(label="Total transformers models", value=len(filtered_data[filtered_data["library"] == "transformers"]))
396
+ with col2:
397
+ st.metric(label="Total non transformers models", value=len(filtered_data[filtered_data["library"] != "transformers"]))
398
+
399
+ st.metric(label="Unique Tags", value=len(tags), delta=int(len(tags) - len(old_tags)))
400
+ st.text(f"New tags {set(tags)-set(old_tags)}")
401
+ st.text(f"Lost tags {set(old_tags)-set(tags)}")
402
+
403
+ st.subheader("Pipeline breakdown by modality")
404
+ col1, col2 = st.columns(2)
405
+ with col1:
406
+ st.metric(label="Total CV models", value=len(filtered_data[filtered_data["modality"] == "cv"]))
407
+ with col2:
408
+ st.metric(label="Total NLP models", value=len(filtered_data[filtered_data["modality"] == "nlp"]))
409
+
410
+ col1, col2 = st.columns(2)
411
+ with col1:
412
+ st.metric(label="Total Audio models", value=len(filtered_data[filtered_data["modality"] == "audio"]))
413
+ with col2:
414
+ st.metric(label="Total RL models", value=len(filtered_data[filtered_data["modality"] == "rl"]))
415
+
416
+ col1, col2 = st.columns(2)
417
+ with col1:
418
+ st.metric(label="Total Tabular models", value=len(filtered_data[filtered_data["modality"] == "tabular"]))
419
+ with col2:
420
+ st.metric(label="Total Multimodal models", value=len(filtered_data[filtered_data["modality"] == "multimodal"]))
421
 
422
  st.subheader("Count of models per pipeline")
423
  st.write(alt.Chart(d).mark_bar().encode(
 
457
  "downloads_30d", "likes", "pytorch", "tensorflow", "jax"]
458
  raw_data = filtered_data[columns_of_interest]
459
  st.dataframe(raw_data)
 
 
460
 
461
  # todo : add activity metric
462
 
 
468
  columns_of_interest = ["prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]
469
  sums = data[columns_of_interest].sum()
470
  sums_old = old_data[columns_of_interest].sum()
471
+ sums_old_old = old_old_data[columns_of_interest].sum()
472
 
473
  col1, col2, col3, col4 = st.columns(4)
474
  with col1:
 
480
  with col4:
481
  st.metric(label="PRs closed", value=sums["prs_closed"], delta=int(sums["prs_closed"] - sums_old["prs_closed"]))
482
 
483
+ col1, col2, col3, col4 = st.columns(4)
484
+ with col1:
485
+ curr_change, delta = change_and_delta(sums_old_old["prs_count"], sums_old["prs_count"], sums["prs_count"])
486
+ st.metric(label="Total PRs change", value=curr_change,delta=delta)
487
+ with col2:
488
+ curr_change, delta = change_and_delta(sums_old_old["prs_open"], sums_old["prs_open"], sums["prs_open"])
489
+ st.metric(label="PRs opened change", value=curr_change,delta=delta)
490
+ with col3:
491
+ curr_change, delta = change_and_delta(sums_old_old["prs_merged"], sums_old["prs_merged"], sums["prs_merged"])
492
+ st.metric(label="PRs merged change", value=curr_change,delta=delta)
493
+ with col4:
494
+ curr_change, delta = change_and_delta(sums_old_old["prs_closed"], sums_old["prs_closed"], sums["prs_closed"])
495
+ st.metric(label="PRs closed change", value=curr_change,delta=delta)
496
+
497
  col1, col2, col3 = st.columns(3)
498
  with col1:
499
  st.metric(label="Total discussions", value=sums["discussions_count"], delta=int(sums["discussions_count"] - sums_old["discussions_count"]))
 
502
  with col3:
503
  st.metric(label="Discussions closed", value=sums["discussions_closed"], delta=int(sums["discussions_closed"] - sums_old["discussions_closed"]))
504
 
505
+ col1, col2, col3 = st.columns(3)
506
+ with col1:
507
+ curr_change, delta = change_and_delta(sums_old_old["discussions_count"], sums_old["discussions_count"], sums["discussions_count"])
508
+ st.metric(label="Total discussions change", value=curr_change,delta=delta)
509
+ with col2:
510
+ curr_change, delta = change_and_delta(sums_old_old["discussions_open"], sums_old["discussions_open"], sums["discussions_open"])
511
+ st.metric(label="Discussions open change", value=curr_change,delta=delta)
512
+ with col3:
513
+ curr_change, delta = change_and_delta(sums_old_old["discussions_closed"], sums_old["discussions_closed"], sums["discussions_closed"])
514
+ st.metric(label="Discussions closed change", value=curr_change,delta=delta)
515
+
516
  filtered_data = data[["repo_id", "prs_count", "prs_open", "prs_merged", "prs_closed", "discussions_count", "discussions_open", "discussions_closed"]].sort_values("prs_count", ascending=False).reset_index(drop=True)
517
  st.dataframe(filtered_data)
518
 
 
522
 
523
  no_library_count = data["library"].isna().sum()
524
  no_library_count_old = old_data["library"].isna().sum()
525
+ no_library_count_old_old = old_old_data["library"].isna().sum()
526
  col1, col2, col3 = st.columns(3)
527
  with col1:
528
  v = total_samples-no_library_count
 
535
  v_old = len(old_data["library"].unique())
536
  st.metric(label="Total Unique library", value=v, delta=int(v-v_old))
537
 
538
+ col1, col2, col3 = st.columns(3)
539
+ with col1:
540
+ v = total_samples-no_library_count
541
+ v_old = total_samples_old-no_library_count_old
542
+ v_old_old = total_samples_old_old-no_library_count_old_old
543
+ curr_change, delta = change_and_delta(v_old_old, v_old, v)
544
+ st.metric(label="# models that have any library change", value=curr_change, delta=delta)
545
+ with col2:
546
+ curr_change, delta = change_and_delta(no_library_count_old_old, no_library_count_old, no_library_count)
547
+ st.metric(label="No library Specified Change", value=curr_change, delta=delta)
548
+ with col3:
549
+ v = len(data["library"].unique())
550
+ v_old = len(old_data["library"].unique())
551
+ v_old_old = len(old_old_data["library"].unique())
552
+ curr_change, delta = change_and_delta(v_old_old, v_old, v)
553
+ st.metric(label="Total Unique library", value=curr_change, delta=delta)
554
 
555
  st.subheader("High-level metrics")
556
  filtered_data = data[data['library'].notna()]
 
610
  y=alt.X('library', sort=None)
611
  ))
612
 
 
 
613
  st.subheader("Aggregated Data")
614
  final_data = pd.merge(
615
  final_data, final_data_old, how="outer", on="library"
 
632
  columns_of_interest = ["has_model_index", "has_metadata", "has_text", "text_length"]
633
  rows = data.shape[0]
634
  rows_old = old_data.shape[0]
635
+ rows_old_old = old_old_data.shape[0]
636
 
637
  cond = data["has_model_index"] | data["has_text"]
638
  with_model_card = data[cond]
 
642
  with_model_card_old = old_data[cond]
643
  c_model_card_old = with_model_card_old.shape[0]
644
 
645
+ cond = old_old_data["has_model_index"] | old_old_data["has_text"]
646
+ with_model_card_old_old = old_old_data[cond]
647
+ c_model_card_old_old = with_model_card_old_old.shape[0]
648
+
649
  st.subheader("High-level metrics")
650
+ col1, col2, col3, col4 = st.columns(4)
651
  with col1:
652
+ st.metric(label="# with model card file", value=c_model_card, delta=int(c_model_card-c_model_card_old))
653
  with col2:
654
+ curr_change, delta = change_and_delta(c_model_card_old_old, c_model_card_old, c_model_card)
655
+ st.metric(label="# with model card file change", value=curr_change, delta=delta)
656
+ with col3:
657
+ st.metric(label="# without model card file", value=rows-c_model_card, delta=int((rows-c_model_card)-(rows_old-c_model_card_old)))
658
+ with col4:
659
+ curr_change, delta = change_and_delta(rows_old_old-c_model_card_old_old, rows_old-c_model_card_old, rows-c_model_card)
660
+ st.metric(label="# without model card file change", value=curr_change, delta=delta)
661
+
662
  with_index = data["has_model_index"].sum()
663
  with_index_old = old_data["has_model_index"].sum()
664
+ with_index_old_old = old_old_data["has_model_index"].sum()
665
  with col1:
666
+ st.metric(label="# with model index", value=with_index, delta=int(with_index-with_index_old))
667
  with col2:
668
+ curr_change, delta = change_and_delta(with_index_old_old, with_index_old, with_index)
669
+ st.metric(label="# with model index change", value=curr_change, delta=delta)
670
+ with col3:
671
+ st.metric(label="# without model index", value=rows-with_index, delta=int((rows-with_index)-(rows_old-with_index_old)))
672
+ with col4:
673
+ curr_change, delta = change_and_delta(rows_old_old-with_index_old_old, rows_old-with_index_old, rows-with_index)
674
+ st.metric(label="# without model index change", value=curr_change, delta=delta)
675
 
676
  with_text = data["has_text"]
677
  with_text_old = old_data["has_text"]
678
+ with_text_old_old = old_old_data["has_text"]
679
+
680
+ with_text_sum = with_text.sum()
681
+ with_text_old_sum = with_text_old.sum()
682
+ with_text_old_old_sum = with_text_old_old.sum()
683
  with col1:
684
+ st.metric(label="# with model card text", value=with_text_sum, delta=int(with_text_sum-with_text_old_sum))
685
  with col2:
686
+ curr_change, delta = change_and_delta(with_text_old_old_sum, with_text_old_sum, with_text_sum)
687
+ st.metric(label="# with model card text change", value=curr_change, delta=delta)
688
+ with col3:
689
+ st.metric(label="# without card text", value=rows-with_text_sum, delta=int((rows-with_text_sum)-(with_text_old_sum)))
690
+ with col4:
691
+ curr_change, delta = change_and_delta(rows_old_old-with_text_old_old_sum, rows_old-with_text_old_sum, rows-with_text_sum)
692
+ st.metric(label="# without card text change", value=curr_change, delta=delta)
693
 
 
694
  st.subheader("Length (chars) of model card content")
695
+ fig, _ = plt.subplots()
696
+ _ = data["length_bins"].value_counts().plot.bar()
697
  st.metric(label="# average length of model card (chars)", value=data[with_text]["text_length"].mean())
698
  st.pyplot(fig)
699
 
pipelines.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def filter_tags(row, filt, operator):
    """Decide whether a row's tags satisfy the selected tag filter.

    Args:
        row: Mapping-like record (for example a DataFrame row) that exposes
            a ``"tags"`` entry.
        filt: Iterable of tag names selected by the user.
        operator: ``"All"`` (every tag in ``filt`` must be present),
            ``"Any"`` (at least one must be present) or ``"None"`` (none
            may be present).

    Returns:
        ``True`` when the row passes the filter; ``False`` otherwise,
        including for an unrecognised ``operator``.
    """
    raw_tags = row["tags"]
    try:
        # Only string tags take part in matching. A new set is built instead
        # of rewriting the list in place, so the caller's data is untouched.
        present = {tag for tag in raw_tags if isinstance(tag, str)}
    except TypeError:
        # The tags value is not iterable (e.g. a missing value): no tags.
        present = set()
    wanted = set(filt)

    if operator == "All":
        return wanted <= present
    if operator == "Any":
        return not wanted.isdisjoint(present)
    if operator == "None":
        return wanted.isdisjoint(present)
    return False
17
+
18
+
19
def filter_pipeline_data(data, modality, pipeline, library, framework, tags, operator):
    """Apply the pipeline-page filters and collect the tags that remain.

    Args:
        data: DataFrame with at least the ``pipeline``, ``modality``,
            ``library`` and ``tags`` columns, plus one column per framework
            name that may be passed as ``framework``.
        modality: ``"All"`` to skip the filter; otherwise the value is
            lower-cased and compared with the ``modality`` column.
        pipeline: ``"all"`` to skip the filter; otherwise an exact
            ``pipeline`` value.
        library: ``"all"`` to skip the filter, ``"not transformers"`` to
            exclude rows whose library is ``"transformers"``, otherwise an
            exact ``library`` value.
        framework: ``"all"`` to skip the filter; otherwise the name of a
            column whose value must equal ``1``.
        tags: Selected tag names; an empty selection skips the tag filter.
        operator: Matching mode forwarded to ``filter_tags``.

    Returns:
        A ``(filtered_data, unique_tags)`` tuple, where ``unique_tags`` is an
        array of the distinct string tags found in the filtered rows.
    """
    # Rows without a pipeline are never part of the breakdown.
    data = data[data["pipeline"].notna()]

    if modality != "All":
        data = data[data["modality"] == modality.lower()]

    if pipeline != "all":
        data = data[data["pipeline"] == pipeline]

    if library == "not transformers":
        data = data[data["library"] != "transformers"]
    elif library != "all":
        data = data[data["library"] == library]

    if framework != "all":
        data = data[data[framework] == 1]

    # Row-wise apply on an empty frame does not yield a boolean Series, so
    # the tag filter is skipped when nothing is left to filter.
    if len(tags) > 0 and not data.empty:
        keep = data.apply(filter_tags, axis=1, filt=tags, operator=operator)
        data = data[keep]

    # Flatten the per-row tag lists and tally them; the tally's order is kept.
    exploded = data["tags"].explode()
    tag_counts = (
        exploded[exploded.notna()]
        .value_counts()
        .rename_axis("tag")
        .to_frame("counts")
        .reset_index()
    )
    tag_names = tag_counts["tag"]
    is_text = tag_names.map(lambda tag: isinstance(tag, str)).astype(bool)

    return data, tag_names[is_text].unique()
utils.py CHANGED
@@ -58,12 +58,15 @@ def eval_tags(row):
58
  return val
59
 
60
  def change_pct(old, new):
 
 
61
  return round(100* (new - old) / new, 3)
62
 
63
  def change_and_delta(old_old, old, new):
64
  curr_change = change_pct(old, new)
65
  prev_change = change_pct(old_old, old)
66
  delta = round(curr_change-prev_change, 3)
67
- delta = f"{delta}%"
 
68
  curr_change = f"{curr_change}%"
69
  return curr_change, delta
 
58
  return val
59
 
60
  def change_pct(old, new):
61
+ if new == 0:
62
+ return -10000000
63
  return round(100* (new - old) / new, 3)
64
 
65
  def change_and_delta(old_old, old, new):
66
  curr_change = change_pct(old, new)
67
  prev_change = change_pct(old_old, old)
68
  delta = round(curr_change-prev_change, 3)
69
+ if delta > 0:
70
+ delta = f"+{delta}%"
71
  curr_change = f"{curr_change}%"
72
  return curr_change, delta