vwxyzjn committed on
Commit
3e78823
1 Parent(s): 788fdf3

Update evaluation results via RewardBench

Browse files
Files changed (1) hide show
  1. README.md +32 -0
README.md CHANGED
@@ -11,6 +11,38 @@ model-index:
11
  metrics:
12
  - type: accuracy
13
  value: 0.5343383584589615
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  - task:
15
  type: preference_evaluation
16
  dataset:
 
11
  metrics:
12
  - type: accuracy
13
  value: 0.5343383584589615
14
+ - task:
15
+ type: preference_evaluation
16
+ dataset:
17
+ name: Chat
18
+ type: Chat
19
+ metrics:
20
+ - type: accuracy
21
+ value: 0.8128491620111732
22
+ - task:
23
+ type: preference_evaluation
24
+ dataset:
25
+ name: Chat Hard
26
+ type: Chat_Hard
27
+ metrics:
28
+ - type: accuracy
29
+ value: 0.5263157894736842
30
+ - task:
31
+ type: preference_evaluation
32
+ dataset:
33
+ name: Safety
34
+ type: Safety
35
+ metrics:
36
+ - type: accuracy
37
+ value: 0.4851351351351351
38
+ - task:
39
+ type: preference_evaluation
40
+ dataset:
41
+ name: Reasoning
42
+ type: Reasoning
43
+ metrics:
44
+ - type: accuracy
45
+ value: 0.3930266819446718
46
  - task:
47
  type: preference_evaluation
48
  dataset: