pavol-bielik committed on
Commit 3e9b40f · 1 Parent(s): d793c0c

add new benchmarks

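All of these result files share the same flat schema: a `config` block (model name, SHA, report URL) and a `results` map from benchmark name to a single `aggregate_score`, which is `null` for benchmarks that were not run (e.g. `training_data_suitability`). Below is a minimal sketch of reading and comparing two of the files added in this commit; `load_scores` is an illustrative helper, not part of this repository:

```python
import json

def load_scores(path):
    # Return {benchmark_name: aggregate_score} for one result file.
    with open(path) as f:
        data = json.load(f)
    return {name: entry["aggregate_score"]
            for name, entry in data["results"].items()}

instruct = load_scores("results/mistralai/Mistral-7B-Instruct-v0.3.json")
base = load_scores("results/mistralai/Mistral-7B-v0.3.json")

for name in sorted(instruct):
    a, b = instruct.get(name), base.get(name)
    # Scores can be null in the JSON (None in Python), so skip those entries.
    if a is not None and b is not None:
        print(f"{name}: instruct={a:.3f}  base={b:.3f}  delta={a - b:+.3f}")
```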
results/mistralai/Mistral-7B-Instruct-v0.2.json CHANGED
@@ -6,58 +6,58 @@
   },
   "results": {
     "bbq": {
-      "aggregate_score": 0.9040460470649646
+      "aggregate_score": 0.9879410240708465
     },
     "bold": {
-      "aggregate_score": 0.747184860969805
+      "aggregate_score": 0.7168366652858247
     },
     "toxicity": {
-      "aggregate_score": 0.9607862783311598
+      "aggregate_score": 0.9647395036825757
     },
     "toxicity_advbench": {
-      "aggregate_score": 0.9940165055485872
+      "aggregate_score": 0.9924029700235294
     },
     "forecasting_consistency": {
-      "aggregate_score": 0.5797872340425532
+      "aggregate_score": 0.6159090909090909
     },
     "self_check_consistency": {
-      "aggregate_score": 0.6974789915966386
+      "aggregate_score": 0.7
     },
     "boolq_contrast_robustness": {
-      "aggregate_score": 0.2833333333333333
+      "aggregate_score": 0.43333333333333335
     },
     "imdb_contrast_robustness": {
-      "aggregate_score": 0.12
+      "aggregate_score": 0.52
     },
     "calibration_big_bench": {
-      "aggregate_score": 0.93375
+      "aggregate_score": 0.66
     },
     "calibration_big_bench_i_know": {
-      "aggregate_score": 0.6862379582134368
+      "aggregate_score": 0.6866007756787189
     },
     "decoding_trust": {
       "aggregate_score": 1.0
     },
     "hellaswag": {
-      "aggregate_score": 0.8477394941246763
+      "aggregate_score": 0.8476399123680541
     },
     "human_eval": {
-      "aggregate_score": 0.4018633540372671
+      "aggregate_score": 0.34658385093167693
     },
     "instruction_goal_hijacking": {
-      "aggregate_score": 0.3118556701030928
+      "aggregate_score": 0.32474226804123707
     },
     "multiturn_goal_hijacking": {
-      "aggregate_score": 0.23205659075224297
+      "aggregate_score": 0.1501150218541523
     },
     "reddit_bias": {
-      "aggregate_score": 0.659153609956636
+      "aggregate_score": 0.6851116558688943
     },
     "truthful_qa_mc2": {
-      "aggregate_score": 0.6681153383230657
+      "aggregate_score": 0.6682102991684862
     },
     "mmlu": {
-      "aggregate_score": 0.5910838911835921
+      "aggregate_score": 0.5891610881640792
     },
     "ai2_reasoning": {
       "aggregate_score": 0.6407849829351536
@@ -66,22 +66,19 @@
       "aggregate_score": 0.9863013698630136
     },
     "memorization": {
-      "aggregate_score": 0.988
+      "aggregate_score": 0.987
     },
     "privacy": {
       "aggregate_score": 1.0
     },
     "fairllm": {
-      "aggregate_score": 0.019112659939765738
+      "aggregate_score": 0.024988622060130607
     },
     "mmlu_robustness": {
-      "aggregate_score": 0.5776363636363636
+      "aggregate_score": 0.5747272727272728
     },
     "training_data_suitability": {
       "aggregate_score": null
-    },
-    "watermarking": {
-      "aggregate_score": null
     }
   }
 }
results/mistralai/Mistral-7B-Instruct-v0.3.json ADDED
@@ -0,0 +1,84 @@
+{
+  "config": {
+    "model_name": "mistralai/Mistral-7B-Instruct-v0.3",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/Mistral-7B-Instruct-v0.3"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.9044152218336122
+    },
+    "bold": {
+      "aggregate_score": 0.7108740914482355
+    },
+    "toxicity": {
+      "aggregate_score": 0.9703781714275141
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9959697062946641
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.6466666666666666
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.6825842696629214
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.5
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.52
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.92625
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.716176654572751
+    },
+    "decoding_trust": {
+      "aggregate_score": 1.0
+    },
+    "hellaswag": {
+      "aggregate_score": 0.8477394941246763
+    },
+    "human_eval": {
+      "aggregate_score": 0.40931677018633544
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.27963917525773196
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.19447895100069013
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.5169315725744212
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.5969331715581344
+    },
+    "mmlu": {
+      "aggregate_score": 0.6138014527845036
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.6399317406143344
+    },
+    "human_deception": {
+      "aggregate_score": 1.0
+    },
+    "memorization": {
+      "aggregate_score": 0.989
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.06456819064936324
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.584
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    }
+  }
+}
results/mistralai/Mistral-7B-v0.3.json ADDED
@@ -0,0 +1,84 @@
+{
+  "config": {
+    "model_name": "mistralai/Mistral-7B-v0.3",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/Mistral-7B-v0.3"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.9564920273573378
+    },
+    "bold": {
+      "aggregate_score": 0.7429509360724185
+    },
+    "toxicity": {
+      "aggregate_score": 0.8792196644341985
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9759773845315347
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.5882978723404255
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.7597173144876326
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.5833333333333334
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.52
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.9275
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.5337044914300013
+    },
+    "decoding_trust": {
+      "aggregate_score": null
+    },
+    "hellaswag": {
+      "aggregate_score": 0.8307110137422824
+    },
+    "human_eval": {
+      "aggregate_score": 0.3024844720496895
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.5154639175257731
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.2931331953071083
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.5081097088728108
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.4260832948366681
+    },
+    "mmlu": {
+      "aggregate_score": 0.6257655604614727
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.6092150170648464
+    },
+    "human_deception": {
+      "aggregate_score": 0.9452054794520548
+    },
+    "memorization": {
+      "aggregate_score": 0.989
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.0
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.5923636363636363
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    }
+  }
+}
results/speakleash/Bielik-11B-v2.3-Instruct.json ADDED
@@ -0,0 +1,84 @@
+{
+  "config": {
+    "model_name": "speakleash/Bielik-11B-v2.3-Instruct",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/SpeakLeash-Bielik-11B-v2.3-Instruct"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.9112192639077836
+    },
+    "bold": {
+      "aggregate_score": 0.7290602998476224
+    },
+    "toxicity": {
+      "aggregate_score": 0.9365256772744497
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9904272396950267
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.6228260869565219
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.6639344262295082
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.8166666666666667
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.85
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.95125
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.5000125109470787
+    },
+    "decoding_trust": {
+      "aggregate_score": 1.0
+    },
+    "hellaswag": {
+      "aggregate_score": 0.7999402509460267
+    },
+    "human_eval": {
+      "aggregate_score": 0.5863354037267081
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.43041237113402064
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.3431216931216931
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.8305070286267637
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.5735697511313619
+    },
+    "mmlu": {
+      "aggregate_score": 0.6467027488961686
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.6032423208191127
+    },
+    "human_deception": {
+      "aggregate_score": 0.6712328767123288
+    },
+    "memorization": {
+      "aggregate_score": 0.99
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.0026849217638691323
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.6194545454545455
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    }
+  }
+}