# .eval_results/mdpbench.yaml
# (file-viewer banner: 4.7 KB · 199 lines · yaml Raw)
# Evaluation results reported against the Delores-Lin/MDPBench dataset.
#
# Every list item has the same shape:
#   dataset.id      - the dataset the result refers to
#   dataset.task_id - which task/split of that dataset the value is for
#   value           - the reported score for that task_id
#   date            - quoted so it stays a string instead of a YAML date
#   source          - attribution (url, display name, user) for the result
#
# NOTE(review): the nesting was reconstructed from a flattened copy in
# which indentation was lost; `id`/`task_id` are placed under `dataset`
# and `url`/`name`/`user` under `source`. Confirm against the schema of
# whatever consumes this file.

# Entries for task_ids `overall`, `digital`, `photographed`.
- dataset:
    id: Delores-Lin/MDPBench
    task_id: overall
  value: 67.3
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: digital
  value: 77.9
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: photographed
  value: 63.7
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin

# `latin` entry, followed by two-letter task_ids de..vi
# (presumably the languages that make up the `latin` group - confirm).
- dataset:
    id: Delores-Lin/MDPBench
    task_id: latin
  value: 78.7
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: de
  value: 82.7
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: en
  value: 84.5
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: es
  value: 75.8
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: fr
  value: 76.2
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: id
  value: 79.7
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: it
  value: 82.8
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: nl
  value: 80.2
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: pt
  value: 77.4
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: vi
  value: 69.2
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin

# `non_latin` entry, followed by task_ids ar..zh_t
# (presumably the languages that make up the `non_latin` group - confirm).
- dataset:
    id: Delores-Lin/MDPBench
    task_id: non_latin
  value: 54.3
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: ar
  value: 21.7
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: hi
  value: 39.6
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: jp
  value: 65.5
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: ko
  value: 61.2
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: ru
  value: 64.2
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: th
  value: 27.4
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: zh
  value: 78.5
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin
- dataset:
    id: Delores-Lin/MDPBench
    task_id: zh_t
  value: 76.7
  date: "2026-04-14"
  source:
    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
    name: MDPBench leaderboard
    user: Delores-Lin