every_eval_ever/mmlu_pro.json
1.8 KB · 55 lines · json Raw
1 {
2 "schema_version": "0.2.2",
3 "evaluation_id": "mmlu_pro_chat/RedHatAI/gemma-4-31B-it-FP8-block/1781646636.373579",
4 "evaluation_timestamp": "1781533280",
5 "retrieved_timestamp": "1781646636.373579",
6 "source_metadata": {
7 "source_name": "lm-evaluation-harness",
8 "source_type": "evaluation_run",
9 "source_organization_name": "RedHatAI",
10 "evaluator_relationship": "third_party"
11 },
12 "eval_library": {
13 "name": "lm_eval",
14 "version": "0.4.13.dev0"
15 },
16 "model_info": {
17 "name": "RedHatAI/gemma-4-31B-it-FP8-block",
18 "id": "RedHatAI/gemma-4-31B-it-FP8-block",
19 "developer": "RedHatAI",
20 "additional_details": {
21 "model_args": "{'model': 'RedHatAI/gemma-4-31B-it-FP8-block', 'max_length': 69632, 'base_url': 'http://127.0.0.1:8000/v1/chat/completions', 'num_concurrent': 32, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 3600}",
22 "num_seeds_merged": "3"
23 }
24 },
25 "evaluation_results": [
26 {
27 "evaluation_name": "mmlu_pro_chat/custom-extract",
28 "source_data": {
29 "dataset_name": "mmlu_pro_chat",
30 "source_type": "other"
31 },
32 "metric_config": {
33 "evaluation_description": "exact_match (filter: custom-extract)",
34 "lower_is_better": false,
35 "score_type": "continuous",
36 "min_score": 0.0,
37 "max_score": 1.0
38 },
39 "score_details": {
40 "score": 0.8544160017730497,
41 "details": {
42 "seed_scores": "[0.8532247340425532, 0.85546875, 0.8545545212765957]",
43 "evaluation_timestamps": "[1781533280, 1781536024, 1781538914]"
44 },
45 "uncertainty": {
46 "standard_error": {
47 "value": 0.0006514836001942016,
48 "method": "across_seeds"
49 },
50 "num_samples": 3
51 }
52 }
53 }
54 ]
55 }