nm-research committed
Commit 249ba2c · verified · 1 parent: 82c3646

Update README.md

Files changed (1): README.md (+42 −42)
README.md CHANGED
@@ -198,88 +198,88 @@ lm_eval \
  <tr>
  <td rowspan="7"><b>OpenLLM V1</b></td>
  <td>arc_challenge</td>
- <td>60.24</td>
- <td>59.30</td>
- <td>98.44%</td>
+ <td>50.60</td>
+ <td>47.35</td>
+ <td>93.57%</td>
  </tr>
  <tr>
  <td>gsm8k</td>
- <td>60.12</td>
- <td>65.13</td>
- <td>108.34%</td>
+ <td>48.07</td>
+ <td>24.34</td>
+ <td>50.65%</td>
  </tr>
  <tr>
  <td>hellaswag</td>
- <td>74.94</td>
- <td>73.31</td>
- <td>97.82%</td>
+ <td>67.78</td>
+ <td>64.89</td>
+ <td>95.74%</td>
  </tr>
  <tr>
  <td>mmlu</td>
- <td>64.14</td>
- <td>63.08</td>
- <td>98.35%</td>
+ <td>59.92</td>
+ <td>57.81</td>
+ <td>96.48%</td>
  </tr>
  <tr>
  <td>truthfulqa_mc2</td>
- <td>54.87</td>
- <td>54.31</td>
- <td>99.00%</td>
+ <td>49.98</td>
+ <td>49.02</td>
+ <td>98.08%</td>
  </tr>
  <tr>
  <td>winogrande</td>
- <td>68.35</td>
- <td>66.77</td>
- <td>97.68%</td>
+ <td>65.11</td>
+ <td>63.61</td>
+ <td>97.70%</td>
  </tr>
  <tr>
  <td><b>Average</b></td>
- <td>63.78</td>
- <td>63.65</td>
- <td><b>99.80%</b></td>
+ <td>56.91</td>
+ <td>51.17</td>
+ <td><b>89.91%</b></td>
  </tr>
  <tr>
  <td rowspan="7"><b>Leaderboard</b></td>
  <td>bbh</td>
- <td>55.46</td>
- <td>54.89</td>
- <td>98.97%</td>
+ <td>53.32</td>
+ <td>51.35</td>
+ <td>96.30%</td>
  </tr>
  <tr>
  <td>mmlu_pro</td>
- <td>34.38</td>
- <td>32.05</td>
- <td>93.23%</td>
+ <td>29.76</td>
+ <td>27.13</td>
+ <td>91.12%</td>
  </tr>
  <tr>
  <td>musr</td>
- <td>33.20</td>
- <td>34.66</td>
- <td>104.40%</td>
+ <td>34.52</td>
+ <td>37.83</td>
+ <td>109.59%</td>
  </tr>
  <tr>
  <td>ifeval</td>
- <td>84.41</td>
- <td>81.65</td>
- <td>96.73%</td>
+ <td>80.22</td>
+ <td>78.30</td>
+ <td>97.60%</td>
  </tr>
  <tr>
  <td>gpqa</td>
- <td>30.87</td>
- <td>28.69</td>
- <td>92.95%</td>
+ <td>30.54</td>
+ <td>30.45</td>
+ <td>99.70%</td>
  </tr>
  <tr>
  <td>math_hard</td>
- <td>45.54</td>
- <td>39.95</td>
- <td>87.72%</td>
+ <td>34.52</td>
+ <td>23.41</td>
+ <td>67.83%</td>
  </tr>
  <tr>
  <td><b>Average</b></td>
- <td>47.31</td>
- <td>45.32</td>
- <td><b>95.78%</b></td>
+ <td>43.81</td>
+ <td>41.41</td>
+ <td><b>94.52%</b></td>
  </tr>
  </tbody>
  </table>
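
For a quick sanity check of the updated numbers: the third column in each row is (up to rounding of the published scores) the second column expressed as a percentage of the first, and each Average row is the plain mean of its column, with the headline recovery computed from those means rather than averaged per task. A minimal sketch below, assuming the two score columns are the baseline and the quantized/recovered results; the dict name and pairing are illustrative, not from the README:

```python
# Reproduce the recovery column and the OpenLLM V1 Average row from the
# updated table. The (baseline, recovered) pairing is assumed from context;
# tiny deviations from the README figures (e.g. 93.58% here vs. 93.57%
# published) come from rounding in the published per-task scores.

openllm_v1 = {
    "arc_challenge":  (50.60, 47.35),
    "gsm8k":          (48.07, 24.34),
    "hellaswag":      (67.78, 64.89),
    "mmlu":           (59.92, 57.81),
    "truthfulqa_mc2": (49.98, 49.02),
    "winogrande":     (65.11, 63.61),
}

def recovery(baseline: float, recovered: float) -> float:
    """Recovered score as a percentage of the baseline score."""
    return 100.0 * recovered / baseline

for task, (base, rec) in openllm_v1.items():
    print(f"{task:15} {recovery(base, rec):6.2f}%")

# The Average row is the mean of each score column; the headline recovery
# (89.91%) is the ratio of those means, not the mean of the per-task
# recovery percentages.
base_avg = sum(b for b, _ in openllm_v1.values()) / len(openllm_v1)
rec_avg  = sum(r for _, r in openllm_v1.values()) / len(openllm_v1)
print(f"Average: {base_avg:.2f} / {rec_avg:.2f} -> {recovery(base_avg, rec_avg):.2f}%")
```

The same calculation applied to the Leaderboard rows reproduces its Average line (43.81 / 41.41 → 94.52%).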