Tom Aarsen committed on
Commit
bd6a61b
·
1 Parent(s): 5c90ee9

Compute model size based on number of parameters

Browse files
Files changed (3) hide show
  1. app.py +100 -126
  2. utils/__init__.py +0 -0
  3. utils/model_size.py +39 -0
app.py CHANGED
@@ -4,11 +4,13 @@ import os
4
 
5
  from datasets import load_dataset
6
  import gradio as gr
7
- from huggingface_hub import get_hf_file_metadata, HfApi, hf_hub_download, hf_hub_url
8
  from huggingface_hub.repocard import metadata_load
9
  import pandas as pd
10
  from tqdm.autonotebook import tqdm
11
 
 
 
12
  TASKS = [
13
  "BitextMining",
14
  "Classification",
@@ -786,94 +788,94 @@ EXTERNAL_MODEL_TO_SEQLEN = {
786
  }
787
 
788
  EXTERNAL_MODEL_TO_SIZE = {
789
- "allenai-specter": 0.44,
790
- "all-MiniLM-L12-v2": 0.13,
791
- "all-MiniLM-L6-v2": 0.09,
792
- "all-mpnet-base-v2": 0.44,
793
- "bert-base-10lang-cased": 0.61,
794
- "bert-base-15lang-cased": 0.61,
795
- "bert-base-25lang-cased": 0.61,
796
- "bert-base-multilingual-cased": 0.71,
797
- "bert-base-multilingual-uncased": 0.67,
798
- "bert-base-uncased": 0.44,
799
- "bert-base-swedish-cased": 0.50,
800
- "bge-base-zh-v1.5": 0.41,
801
- "bge-large-zh-v1.5": 1.30,
802
- "bge-large-zh-noinstruct": 1.30,
803
- "bge-small-zh-v1.5": 0.10,
804
- "camembert-base": 0.45,
805
- "camembert-large": 1.35,
806
- "cross-en-de-roberta-sentence-transformer": 1.11,
807
- "contriever-base-msmarco": 0.44,
808
- "distilbert-base-25lang-cased": 0.44,
809
- "distilbert-base-en-fr-cased": 0.44,
810
- "distilbert-base-en-fr-es-pt-it-cased": 0.44,
811
- "distilbert-base-fr-cased": 0.44,
812
- "distilbert-base-uncased": 0.44,
813
- "DanskBERT": 0.50,
814
- "distiluse-base-multilingual-cased-v2": 0.54,
815
- "dfm-encoder-large-v1": 1.42,
816
- "dfm-sentence-encoder-large-1": 1.63,
817
- "e5-base": 0.44,
818
- "e5-large": 1.34,
819
- "e5-mistral-7b-instruct": 14.22,
820
- "e5-small": 0.13,
821
- "electra-small-nordic": 0.09,
822
- "electra-small-swedish-cased-discriminator": 0.06,
823
- "flaubert_base_cased": 0.55,
824
- "flaubert_base_uncased": 0.55,
825
- "flaubert_large_cased": 1.49,
826
- "gbert-base": 0.44,
827
- "gbert-large": 1.35,
828
- "gelectra-base": 0.44,
829
- "gelectra-large": 1.34,
830
- "glove.6B.300d": 0.48,
831
- "gottbert-base": 0.51,
832
- "gtr-t5-base": 0.22,
833
- "gtr-t5-large": 0.67,
834
- "gtr-t5-xl": 2.48,
835
- "gtr-t5-xxl": 9.73,
836
- "herbert-base-retrieval-v2": 0.50,
837
- "komninos": 0.27,
838
- "luotuo-bert-medium": 1.31,
839
- "LASER2": 0.17,
840
- "LaBSE": 1.88,
841
- "m3e-base": 0.41,
842
- "m3e-large": 0.41,
843
- "msmarco-bert-co-condensor": 0.44,
844
- "multi-qa-MiniLM-L6-cos-v1": 0.09,
845
- "multilingual-e5-base": 1.11,
846
- "multilingual-e5-small": 0.47,
847
- "multilingual-e5-large": 2.24,
848
- "nb-bert-base": 0.71,
849
- "nb-bert-large": 1.42,
850
- "nomic-embed-text-v1.5-64": 0.55,
851
- "nomic-embed-text-v1.5-128": 0.55,
852
- "nomic-embed-text-v1.5-256": 0.55,
853
- "nomic-embed-text-v1.5-512": 0.55,
854
- "norbert3-base": 0.52,
855
- "norbert3-large": 1.47,
856
- "paraphrase-multilingual-mpnet-base-v2": 1.11,
857
- "paraphrase-multilingual-MiniLM-L12-v2": 0.47,
858
- "sentence-camembert-base": 0.44,
859
- "sentence-camembert-large": 1.35,
860
- "sentence-croissant-llm-base": 5.12,
861
- "sentence-bert-swedish-cased": 0.50,
862
- "sentence-t5-base": 0.22,
863
- "sentence-t5-large": 0.67,
864
- "sentence-t5-xl": 2.48,
865
- "sentence-t5-xxl": 9.73,
866
- "silver-retriever-base-v1": 0.50,
867
- "sup-simcse-bert-base-uncased": 0.44,
868
- "st-polish-paraphrase-from-distilroberta": 0.50,
869
- "st-polish-paraphrase-from-mpnet": 0.50,
870
- "text2vec-base-chinese": 0.41,
871
- "text2vec-large-chinese": 1.30,
872
- "unsup-simcse-bert-base-uncased": 0.44,
873
- "use-cmlm-multilingual": 1.89,
874
- "voyage-lite-02-instruct": 2.45,
875
- "xlm-roberta-base": 1.12,
876
- "xlm-roberta-large": 2.24,
877
  }
878
 
879
  MODELS_TO_SKIP = {
@@ -997,6 +999,7 @@ MODELS_TO_SKIP = {
997
  "beademiguelperez/sentence-transformers-multilingual-e5-small",
998
  "arcdev/SFR-Embedding-Mistral",
999
  "arcdev/e5-mistral-7b-instruct",
 
1000
  }
1001
 
1002
  def add_lang(examples):
@@ -1079,36 +1082,7 @@ def get_dim_seq_size(model):
1079
  dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
1080
  seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
1081
  # Get model file size without downloading
1082
- if "pytorch_model.bin" in filenames:
1083
- url = hf_hub_url(model.modelId, filename="pytorch_model.bin")
1084
- meta = get_hf_file_metadata(url)
1085
- size = round(meta.size / 1e9, 2)
1086
- elif "pytorch_model.bin.index.json" in filenames:
1087
- index_path = hf_hub_download(model.modelId, filename="pytorch_model.bin.index.json")
1088
- """
1089
- {
1090
- "metadata": {
1091
- "total_size": 28272820224
1092
- },....
1093
- """
1094
- size = json.load(open(index_path))
1095
- if ("metadata" in size) and ("total_size" in size["metadata"]):
1096
- size = round(size["metadata"]["total_size"] / 1e9, 2)
1097
- elif "model.safetensors" in filenames:
1098
- url = hf_hub_url(model.modelId, filename="model.safetensors")
1099
- meta = get_hf_file_metadata(url)
1100
- size = round(meta.size / 1e9, 2)
1101
- elif "model.safetensors.index.json" in filenames:
1102
- index_path = hf_hub_download(model.modelId, filename="model.safetensors.index.json")
1103
- """
1104
- {
1105
- "metadata": {
1106
- "total_size": 14483464192
1107
- },....
1108
- """
1109
- size = json.load(open(index_path))
1110
- if ("metadata" in size) and ("total_size" in size["metadata"]):
1111
- size = round(size["metadata"]["total_size"] / 1e9, 2)
1112
  return dim, seq, size
1113
 
1114
  def make_datasets_clickable(df):
@@ -1120,7 +1094,7 @@ def make_datasets_clickable(df):
1120
  return df
1121
 
1122
  def add_rank(df):
1123
- cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens"]]
1124
  if len(cols_to_rank) == 1:
1125
  df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
1126
  else:
@@ -1150,7 +1124,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
1150
  # Model & at least one result
1151
  if len(res) > 1:
1152
  if add_emb_dim:
1153
- res["Model Size (GB)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
1154
  res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
1155
  res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
1156
  df_list.append(res)
@@ -1191,7 +1165,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
1191
  if add_emb_dim:
1192
  try:
1193
  # Fails on gated repos, so we only include scores for them
1194
- out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (GB)"] = get_dim_seq_size(model)
1195
  except:
1196
  pass
1197
  df_list.append(out)
@@ -1268,7 +1242,7 @@ def get_mteb_average():
1268
  # Fill NaN after averaging
1269
  DATA_OVERALL.fillna("", inplace=True)
1270
 
1271
- DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
1272
  DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
1273
 
1274
  return DATA_OVERALL
@@ -1327,7 +1301,7 @@ def get_mteb_average_zh():
1327
  # Fill NaN after averaging
1328
  DATA_OVERALL_ZH.fillna("", inplace=True)
1329
 
1330
- DATA_OVERALL_ZH = DATA_OVERALL_ZH[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_ZH)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)"]]
1331
  DATA_OVERALL_ZH = DATA_OVERALL_ZH[DATA_OVERALL_ZH.iloc[:, 5:].ne("").any(axis=1)]
1332
 
1333
  return DATA_OVERALL_ZH
@@ -1389,7 +1363,7 @@ def get_mteb_average_fr():
1389
  # Fill NaN after averaging
1390
  DATA_OVERALL_FR.fillna("", inplace=True)
1391
 
1392
- DATA_OVERALL_FR = DATA_OVERALL_FR[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_FR)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_FR)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_FR)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_FR)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_FR)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_FR)} datasets)", f"STS Average ({len(TASK_LIST_STS_FR)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION_FR)} dataset)"]]
1393
  DATA_OVERALL_FR = DATA_OVERALL_FR[DATA_OVERALL_FR.iloc[:, 5:].ne("").any(axis=1)]
1394
 
1395
  return DATA_OVERALL_FR
@@ -1443,7 +1417,7 @@ def get_mteb_average_pl():
1443
  # Fill NaN after averaging
1444
  DATA_OVERALL_PL.fillna("", inplace=True)
1445
 
1446
- DATA_OVERALL_PL = DATA_OVERALL_PL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_PL)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", f"STS Average ({len(TASK_LIST_STS_PL)} datasets)"]]
1447
  DATA_OVERALL_PL = DATA_OVERALL_PL[DATA_OVERALL_PL.iloc[:, 5:].ne("").any(axis=1)]
1448
 
1449
  return DATA_OVERALL_PL
 
4
 
5
  from datasets import load_dataset
6
  import gradio as gr
7
+ from huggingface_hub import HfApi, hf_hub_download
8
  from huggingface_hub.repocard import metadata_load
9
  import pandas as pd
10
  from tqdm.autonotebook import tqdm
11
 
12
+ from utils.model_size import get_model_size
13
+
14
  TASKS = [
15
  "BitextMining",
16
  "Classification",
 
788
  }
789
 
790
  EXTERNAL_MODEL_TO_SIZE = {
791
+ "allenai-specter": 110,
792
+ "all-MiniLM-L12-v2": 33,
793
+ "all-MiniLM-L6-v2": 23,
794
+ "all-mpnet-base-v2": 110,
795
+ "bert-base-10lang-cased": 138,
796
+ "bert-base-15lang-cased": 138,
797
+ "bert-base-25lang-cased": 138,
798
+ "bert-base-multilingual-cased": 179,
799
+ "bert-base-multilingual-uncased": 168,
800
+ "bert-base-uncased": 110,
801
+ "bert-base-swedish-cased": 125,
802
+ "bge-base-zh-v1.5": 102,
803
+ "bge-large-zh-v1.5": 326,
804
+ "bge-large-zh-noinstruct": 326,
805
+ "bge-small-zh-v1.5": 24,
806
+ "camembert-base": 111,
807
+ "camembert-large": 338,
808
+ "cross-en-de-roberta-sentence-transformer": 278,
809
+ "contriever-base-msmarco": 110,
810
+ "distilbert-base-25lang-cased": 110,
811
+ "distilbert-base-en-fr-cased": 110,
812
+ "distilbert-base-en-fr-es-pt-it-cased": 110,
813
+ "distilbert-base-fr-cased": 110,
814
+ "distilbert-base-uncased": 110,
815
+ "DanskBERT": 125,
816
+ "distiluse-base-multilingual-cased-v2": 135,
817
+ "dfm-encoder-large-v1": 355,
818
+ "dfm-sentence-encoder-large-1": 355,
819
+ "e5-base": 110,
820
+ "e5-large": 335,
821
+ "e5-mistral-7b-instruct": 7110,
822
+ "e5-small": 33,
823
+ "electra-small-nordic": 23,
824
+ "electra-small-swedish-cased-discriminator": 16,
825
+ "flaubert_base_cased": 138,
826
+ "flaubert_base_uncased": 138,
827
+ "flaubert_large_cased": 372,
828
+ "gbert-base": 110,
829
+ "gbert-large": 337,
830
+ "gelectra-base": 110,
831
+ "gelectra-large": 335,
832
+ "glove.6B.300d": 120,
833
+ "gottbert-base": 127,
834
+ "gtr-t5-base": 110,
835
+ "gtr-t5-large": 168,
836
+ "gtr-t5-xl": 1240,
837
+ "gtr-t5-xxl": 4865,
838
+ "herbert-base-retrieval-v2": 125,
839
+ "komninos": 134,
840
+ "luotuo-bert-medium": 328,
841
+ "LASER2": 43,
842
+ "LaBSE": 471,
843
+ "m3e-base": 102,
844
+ "m3e-large": 102,
845
+ "msmarco-bert-co-condensor": 110,
846
+ "multi-qa-MiniLM-L6-cos-v1": 23,
847
+ "multilingual-e5-base": 278,
848
+ "multilingual-e5-small": 118,
849
+ "multilingual-e5-large": 560,
850
+ "nb-bert-base": 179,
851
+ "nb-bert-large": 355,
852
+ "nomic-embed-text-v1.5-64": 138,
853
+ "nomic-embed-text-v1.5-128": 138,
854
+ "nomic-embed-text-v1.5-256": 138,
855
+ "nomic-embed-text-v1.5-512": 138,
856
+ "norbert3-base": 131,
857
+ "norbert3-large": 368,
858
+ "paraphrase-multilingual-mpnet-base-v2": 278,
859
+ "paraphrase-multilingual-MiniLM-L12-v2": 118,
860
+ "sentence-camembert-base": 110,
861
+ "sentence-camembert-large": 337,
862
+ "sentence-croissant-llm-base": 1280,
863
+ "sentence-bert-swedish-cased": 125,
864
+ "sentence-t5-base": 110,
865
+ "sentence-t5-large": 168,
866
+ "sentence-t5-xl": 1240,
867
+ "sentence-t5-xxl": 4865,
868
+ "silver-retriever-base-v1": 125,
869
+ "sup-simcse-bert-base-uncased": 110,
870
+ "st-polish-paraphrase-from-distilroberta": 125,
871
+ "st-polish-paraphrase-from-mpnet": 125,
872
+ "text2vec-base-chinese": 102,
873
+ "text2vec-large-chinese": 326,
874
+ "unsup-simcse-bert-base-uncased": 110,
875
+ "use-cmlm-multilingual": 472,
876
+ "voyage-lite-02-instruct": 613,
877
+ "xlm-roberta-base": 279,
878
+ "xlm-roberta-large": 560,
879
  }
880
 
881
  MODELS_TO_SKIP = {
 
999
  "beademiguelperez/sentence-transformers-multilingual-e5-small",
1000
  "arcdev/SFR-Embedding-Mistral",
1001
  "arcdev/e5-mistral-7b-instruct",
1002
+ "Koat/gte-tiny",
1003
  }
1004
 
1005
  def add_lang(examples):
 
1082
  dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
1083
  seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
1084
  # Get model size (in millions of parameters) without downloading the weights
1085
+ size = get_model_size(model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1086
  return dim, seq, size
1087
 
1088
  def make_datasets_clickable(df):
 
1094
  return df
1095
 
1096
  def add_rank(df):
1097
+ cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Million Parameters)", "Embedding Dimensions", "Max Tokens"]]
1098
  if len(cols_to_rank) == 1:
1099
  df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
1100
  else:
 
1124
  # Model & at least one result
1125
  if len(res) > 1:
1126
  if add_emb_dim:
1127
+ res["Model Size (Million Parameters)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
1128
  res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
1129
  res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
1130
  df_list.append(res)
 
1165
  if add_emb_dim:
1166
  try:
1167
  # Fails on gated repos, so we only include scores for them
1168
+ out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"] = get_dim_seq_size(model)
1169
  except:
1170
  pass
1171
  df_list.append(out)
 
1242
  # Fill NaN after averaging
1243
  DATA_OVERALL.fillna("", inplace=True)
1244
 
1245
+ DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (Million Parameters)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
1246
  DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
1247
 
1248
  return DATA_OVERALL
 
1301
  # Fill NaN after averaging
1302
  DATA_OVERALL_ZH.fillna("", inplace=True)
1303
 
1304
+ DATA_OVERALL_ZH = DATA_OVERALL_ZH[["Rank", "Model", "Model Size (Million Parameters)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_ZH)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_ZH)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_ZH)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_ZH)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_ZH)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_ZH)} datasets)", f"STS Average ({len(TASK_LIST_STS_ZH)} datasets)"]]
1305
  DATA_OVERALL_ZH = DATA_OVERALL_ZH[DATA_OVERALL_ZH.iloc[:, 5:].ne("").any(axis=1)]
1306
 
1307
  return DATA_OVERALL_ZH
 
1363
  # Fill NaN after averaging
1364
  DATA_OVERALL_FR.fillna("", inplace=True)
1365
 
1366
+ DATA_OVERALL_FR = DATA_OVERALL_FR[["Rank", "Model", "Model Size (Million Parameters)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_FR)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_FR)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_FR)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_FR)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING_FR)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_FR)} datasets)", f"STS Average ({len(TASK_LIST_STS_FR)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION_FR)} dataset)"]]
1367
  DATA_OVERALL_FR = DATA_OVERALL_FR[DATA_OVERALL_FR.iloc[:, 5:].ne("").any(axis=1)]
1368
 
1369
  return DATA_OVERALL_FR
 
1417
  # Fill NaN after averaging
1418
  DATA_OVERALL_PL.fillna("", inplace=True)
1419
 
1420
+ DATA_OVERALL_PL = DATA_OVERALL_PL[["Rank", "Model", "Model Size (Million Parameters)", "Embedding Dimensions", "Max Tokens", f"Average ({len(TASK_LIST_PL)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION_PL)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING_PL)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION_PL)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL_PL)} datasets)", f"STS Average ({len(TASK_LIST_STS_PL)} datasets)"]]
1421
  DATA_OVERALL_PL = DATA_OVERALL_PL[DATA_OVERALL_PL.iloc[:, 5:].ne("").any(axis=1)]
1422
 
1423
  return DATA_OVERALL_PL
utils/__init__.py ADDED
File without changes
utils/model_size.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata, model_info as get_model_info, get_hf_file_metadata, hf_hub_url
4
+ from huggingface_hub import hf_hub_download
5
+
6
+ # Map model IDs to the number of bytes used for one parameter. So, 4 bytes for fp32, 2 bytes for fp16, etc.
7
+ # By default, we assume that the model is stored in fp32.
8
+ KNOWN_BYTES_PER_PARAM = {}
9
+
10
+
11
def get_model_size(model_info: ModelInfo):
    """Return the size of the model in millions of parameters.

    The parameter count is resolved with the first strategy that succeeds:

    1. Sum the per-dtype parameter counts from the repository's safetensors
       metadata.
    2. Divide the size in bytes of ``pytorch_model.bin`` by the number of
       bytes per parameter.
    3. Divide ``metadata.total_size`` from ``pytorch_model.bin.index.json``
       by the number of bytes per parameter.

    The bytes-per-parameter value is looked up in ``KNOWN_BYTES_PER_PARAM``
    by model ID and defaults to 4.

    Args:
        model_info: Hub model description; its ``id`` and ``siblings``
            attributes are read.

    Returns:
        The rounded number of parameters in millions, or ``None`` when none
        of the strategies yields a value.
    """
    try:
        safetensors = get_safetensors_metadata(model_info.id)
        return round(sum(safetensors.parameter_count.values()) / 1e6)
    except Exception:
        # Deliberately best-effort: any failure to obtain safetensors metadata
        # falls through to the file-size based estimates below.
        pass

    # ``siblings`` may be None when the file listing was not fetched.
    filenames = {sib.rfilename for sib in model_info.siblings or []}

    if "pytorch_model.bin" in filenames:
        url = hf_hub_url(model_info.id, filename="pytorch_model.bin")
        meta = get_hf_file_metadata(url)
        bytes_per_param = KNOWN_BYTES_PER_PARAM.get(model_info.id, 4)
        return round(meta.size / bytes_per_param / 1e6)

    if "pytorch_model.bin.index.json" in filenames:
        index_path = hf_hub_download(model_info.id, filename="pytorch_model.bin.index.json")
        # The index file is expected to look like:
        #   {"metadata": {"total_size": 28272820224}, ...}
        with open(index_path, encoding="utf-8") as index_file:
            index = json.load(index_file)
        bytes_per_param = KNOWN_BYTES_PER_PARAM.get(model_info.id, 4)
        if ("metadata" in index) and ("total_size" in index["metadata"]):
            return round(index["metadata"]["total_size"] / bytes_per_param / 1e6)

    return None