pavol-bielik committed on
Commit 3e9b40f · 1 Parent(s): d793c0c

add new benchmarks

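All of these result files share the same flat schema: a `config` block (model name, SHA, report URL) and a `results` map from benchmark name to a single `aggregate_score`, which is `null` for benchmarks that were not run (e.g. `training_data_suitability`). Below is a minimal sketch of reading and comparing two of the files added in this commit; `load_scores` is an illustrative helper, not part of this repository:

```python
import json

def load_scores(path):
    # Return {benchmark_name: aggregate_score} for one result file.
    with open(path) as f:
        data = json.load(f)
    return {name: entry["aggregate_score"]
            for name, entry in data["results"].items()}

instruct = load_scores("results/mistralai/Mistral-7B-Instruct-v0.3.json")
base = load_scores("results/mistralai/Mistral-7B-v0.3.json")

for name in sorted(instruct):
    a, b = instruct.get(name), base.get(name)
    # Scores can be null in the JSON (None in Python), so skip those entries.
    if a is not None and b is not None:
        print(f"{name}: instruct={a:.3f}  base={b:.3f}  delta={a - b:+.3f}")
```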
results/mistralai/Mistral-7B-Instruct-v0.2.json CHANGED
@@ -6,58 +6,58 @@
   },
   "results": {
     "bbq": {
-      "aggregate_score": 0.9040460470649646
+      "aggregate_score": 0.9879410240708465
     },
     "bold": {
-      "aggregate_score": 0.747184860969805
+      "aggregate_score": 0.7168366652858247
     },
     "toxicity": {
-      "aggregate_score": 0.9607862783311598
+      "aggregate_score": 0.9647395036825757
     },
     "toxicity_advbench": {
-      "aggregate_score": 0.9940165055485872
+      "aggregate_score": 0.9924029700235294
     },
     "forecasting_consistency": {
-      "aggregate_score": 0.5797872340425532
+      "aggregate_score": 0.6159090909090909
     },
     "self_check_consistency": {
-      "aggregate_score": 0.6974789915966386
+      "aggregate_score": 0.7
     },
     "boolq_contrast_robustness": {
-      "aggregate_score": 0.2833333333333333
+      "aggregate_score": 0.43333333333333335
     },
     "imdb_contrast_robustness": {
-      "aggregate_score": 0.12
+      "aggregate_score": 0.52
     },
     "calibration_big_bench": {
-      "aggregate_score": 0.93375
+      "aggregate_score": 0.66
     },
     "calibration_big_bench_i_know": {
-      "aggregate_score": 0.6862379582134368
+      "aggregate_score": 0.6866007756787189
     },
     "decoding_trust": {
       "aggregate_score": 1.0
     },
     "hellaswag": {
-      "aggregate_score": 0.8477394941246763
+      "aggregate_score": 0.8476399123680541
     },
     "human_eval": {
-      "aggregate_score": 0.4018633540372671
+      "aggregate_score": 0.34658385093167693
     },
     "instruction_goal_hijacking": {
-      "aggregate_score": 0.3118556701030928
+      "aggregate_score": 0.32474226804123707
     },
     "multiturn_goal_hijacking": {
-      "aggregate_score": 0.23205659075224297
+      "aggregate_score": 0.1501150218541523
     },
     "reddit_bias": {
-      "aggregate_score": 0.659153609956636
+      "aggregate_score": 0.6851116558688943
     },
     "truthful_qa_mc2": {
-      "aggregate_score": 0.6681153383230657
+      "aggregate_score": 0.6682102991684862
     },
     "mmlu": {
-      "aggregate_score": 0.5910838911835921
+      "aggregate_score": 0.5891610881640792
     },
     "ai2_reasoning": {
       "aggregate_score": 0.6407849829351536
@@ -66,22 +66,19 @@
       "aggregate_score": 0.9863013698630136
     },
     "memorization": {
-      "aggregate_score": 0.988
+      "aggregate_score": 0.987
     },
     "privacy": {
       "aggregate_score": 1.0
     },
     "fairllm": {
-      "aggregate_score": 0.019112659939765738
+      "aggregate_score": 0.024988622060130607
     },
     "mmlu_robustness": {
-      "aggregate_score": 0.5776363636363636
+      "aggregate_score": 0.5747272727272728
     },
     "training_data_suitability": {
       "aggregate_score": null
-    },
-    "watermarking": {
-      "aggregate_score": null
     }
   }
 }
results/mistralai/Mistral-7B-Instruct-v0.3.json ADDED
@@ -0,0 +1,84 @@
+{
+  "config": {
+    "model_name": "mistralai/Mistral-7B-Instruct-v0.3",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/Mistral-7B-Instruct-v0.3"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.9044152218336122
+    },
+    "bold": {
+      "aggregate_score": 0.7108740914482355
+    },
+    "toxicity": {
+      "aggregate_score": 0.9703781714275141
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9959697062946641
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.6466666666666666
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.6825842696629214
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.5
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.52
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.92625
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.716176654572751
+    },
+    "decoding_trust": {
+      "aggregate_score": 1.0
+    },
+    "hellaswag": {
+      "aggregate_score": 0.8477394941246763
+    },
+    "human_eval": {
+      "aggregate_score": 0.40931677018633544
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.27963917525773196
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.19447895100069013
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.5169315725744212
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.5969331715581344
+    },
+    "mmlu": {
+      "aggregate_score": 0.6138014527845036
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.6399317406143344
+    },
+    "human_deception": {
+      "aggregate_score": 1.0
+    },
+    "memorization": {
+      "aggregate_score": 0.989
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.06456819064936324
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.584
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    }
+  }
+}
results/mistralai/Mistral-7B-v0.3.json ADDED
@@ -0,0 +1,84 @@
+{
+  "config": {
+    "model_name": "mistralai/Mistral-7B-v0.3",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/Mistral-7B-v0.3"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.9564920273573378
+    },
+    "bold": {
+      "aggregate_score": 0.7429509360724185
+    },
+    "toxicity": {
+      "aggregate_score": 0.8792196644341985
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9759773845315347
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.5882978723404255
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.7597173144876326
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.5833333333333334
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.52
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.9275
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.5337044914300013
+    },
+    "decoding_trust": {
+      "aggregate_score": null
+    },
+    "hellaswag": {
+      "aggregate_score": 0.8307110137422824
+    },
+    "human_eval": {
+      "aggregate_score": 0.3024844720496895
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.5154639175257731
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.2931331953071083
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.5081097088728108
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.4260832948366681
+    },
+    "mmlu": {
+      "aggregate_score": 0.6257655604614727
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.6092150170648464
+    },
+    "human_deception": {
+      "aggregate_score": 0.9452054794520548
+    },
+    "memorization": {
+      "aggregate_score": 0.989
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.0
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.5923636363636363
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    }
+  }
+}
results/speakleash/Bielik-11B-v2.3-Instruct.json ADDED
@@ -0,0 +1,84 @@
+{
+  "config": {
+    "model_name": "speakleash/Bielik-11B-v2.3-Instruct",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/SpeakLeash-Bielik-11B-v2.3-Instruct"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.9112192639077836
+    },
+    "bold": {
+      "aggregate_score": 0.7290602998476224
+    },
+    "toxicity": {
+      "aggregate_score": 0.9365256772744497
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9904272396950267
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.6228260869565219
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.6639344262295082
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.8166666666666667
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.85
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.95125
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.5000125109470787
+    },
+    "decoding_trust": {
+      "aggregate_score": 1.0
+    },
+    "hellaswag": {
+      "aggregate_score": 0.7999402509460267
+    },
+    "human_eval": {
+      "aggregate_score": 0.5863354037267081
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.43041237113402064
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.3431216931216931
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.8305070286267637
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.5735697511313619
+    },
+    "mmlu": {
+      "aggregate_score": 0.6467027488961686
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.6032423208191127
+    },
+    "human_deception": {
+      "aggregate_score": 0.6712328767123288
+    },
+    "memorization": {
+      "aggregate_score": 0.99
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.0026849217638691323
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.6194545454545455
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    }
+  }
+}