File size: 2,756 Bytes
cbf1ab5
 
 
0954803
cbf1ab5
0954803
e5c7cff
 
0954803
e5c7cff
0954803
 
 
 
 
 
e5c7cff
0954803
 
 
a69ad7a
 
0954803
 
 
cbf1ab5
c81a29a
0954803
cbf1ab5
0954803
 
 
 
 
cbf1ab5
0954803
 
 
a69ad7a
 
0954803
78f3320
0954803
78f3320
 
0954803
78f3320
0954803
 
 
 
 
 
78f3320
0954803
 
a69ad7a
 
0954803
78f3320
0954803
78f3320
 
0954803
78f3320
0954803
 
 
 
 
 
78f3320
0954803
 
 
a69ad7a
 
0954803
78f3320
0954803
78f3320
 
0954803
78f3320
0954803
 
 
 
 
 
78f3320
0954803
 
 
a69ad7a
 
0954803
78f3320
0954803
78f3320
 
0954803
78f3320
0954803
 
 
 
 
 
78f3320
0954803
 
 
a69ad7a
 
0954803
78f3320
 
e5c7cff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
---
# Metadata document: a license identifier plus a model-index of benchmark
# results for one model.
license: apache-2.0
model-index:
# One model entry; every item under `results` groups a task, a dataset,
# a metrics list and a source.
- name: metadata-test
  results:
  # AI2 Reasoning Challenge (25-Shot)
  # ARC-Challenge config, test split, 25 few-shot examples; the reported
  # metric is normalized accuracy (acc_norm).
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: AI2 Reasoning Challenge (25-Shot)
      type: ai2_arc
      config: ARC-Challenge
      split: test
      args:
        num_few_shot: 25
    metrics:
    - type: acc_norm
      name: normalized accuracy
      value: 0.6203071672354948
    source:
      name: Open LLM Leaderboard
      # NOTE(review): the URL stops at "details_" with no suffix after the
      # underscore; it looks like an unfilled placeholder -- confirm target.
      url: https://huggingface.co/datasets/open-llm-leaderboard/details_
      
  # HellaSwag (10-shot)
  # Validation split, 10 few-shot examples; the reported metric is
  # normalized accuracy (acc_norm). No dataset config is specified.
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: HellaSwag (10-Shot)
      type: hellaswag
      split: validation
      args:
        num_few_shot: 10
    metrics:
    - type: acc_norm
      name: normalized accuracy
      value: 0.8435570603465445
    source:
      name: Open LLM Leaderboard
      # NOTE(review): the URL stops at "details_" with no suffix after the
      # underscore; it looks like an unfilled placeholder -- confirm target.
      url: https://huggingface.co/datasets/open-llm-leaderboard/details_

  # TruthfulQA (0-shot)
  # multiple_choice config, validation split, zero few-shot examples; the
  # reported metric is mc2 (this entry carries no metric display name).
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: TruthfulQA (0-shot)
      type: truthful_qa
      config: multiple_choice
      split: validation
      args:
        num_few_shot: 0
    metrics:
    - type: mc2
      value: 0.5744916942762855
    source:
      name: Open LLM Leaderboard
      # NOTE(review): the URL stops at "details_" with no suffix after the
      # underscore; it looks like an unfilled placeholder -- confirm target.
      url: https://huggingface.co/datasets/open-llm-leaderboard/details_

  # GSM8k (5-shot)
  # main config, test split, 5 few-shot examples; the reported metric is
  # accuracy (acc).
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: GSM8k (5-shot)
      type: gsm8k
      config: main
      split: test
      args:
        num_few_shot: 5
    metrics:
    - type: acc
      name: accuracy
      value: 0.12736921910538287
    source:
      name: Open LLM Leaderboard
      # NOTE(review): the URL stops at "details_" with no suffix after the
      # underscore; it looks like an unfilled placeholder -- confirm target.
      url: https://huggingface.co/datasets/open-llm-leaderboard/details_

  # MMLU (5-Shot)
  # "all" config, test split, 5 few-shot examples; the reported metric is
  # accuracy (acc).
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: MMLU (5-Shot)
      type: cais/mmlu
      config: all
      split: test
      args:
        num_few_shot: 5
    metrics:
    - type: acc
      name: accuracy
      value: 0.6107
    source:
      name: Open LLM Leaderboard
      # NOTE(review): the URL stops at "details_" with no suffix after the
      # underscore; it looks like an unfilled placeholder -- confirm target.
      url: https://huggingface.co/datasets/open-llm-leaderboard/details_

  # Winogrande (5-shot)
  # winogrande_xl config, validation split, 5 few-shot examples; the
  # reported metric is accuracy (acc).
  - task:
      type: text-generation
      name: Text Generation
    dataset:
      name: Winogrande (5-shot)
      type: winogrande
      config: winogrande_xl
      split: validation
      args:
        num_few_shot: 5
    metrics:
    - type: acc
      name: accuracy
      value: 0.7774269928966061
    source:
      name: Open LLM Leaderboard
      # NOTE(review): the URL stops at "details_" with no suffix after the
      # underscore; it looks like an unfilled placeholder -- confirm target.
      url: https://huggingface.co/datasets/open-llm-leaderboard/details_


---