every_eval_ever/mmlu_pro.json

1.8 KB · 55 lines · json Raw

1	`{`
2	`"schema_version": "0.2.2",`
3	`"evaluation_id": "mmlu_pro_chat/RedHatAI/gemma-4-31B-it-FP8-block/1781646636.373579",`
4	`"evaluation_timestamp": "1781533280",`
5	`"retrieved_timestamp": "1781646636.373579",`
6	`"source_metadata": {`
7	`"source_name": "lm-evaluation-harness",`
8	`"source_type": "evaluation_run",`
9	`"source_organization_name": "RedHatAI",`
10	`"evaluator_relationship": "third_party"`
11	`},`
12	`"eval_library": {`
13	`"name": "lm_eval",`
14	`"version": "0.4.13.dev0"`
15	`},`
16	`"model_info": {`
17	`"name": "RedHatAI/gemma-4-31B-it-FP8-block",`
18	`"id": "RedHatAI/gemma-4-31B-it-FP8-block",`
19	`"developer": "RedHatAI",`
20	`"additional_details": {`
21	`"model_args": "{'model': 'RedHatAI/gemma-4-31B-it-FP8-block', 'max_length': 69632, 'base_url': 'http://127.0.0.1:8000/v1/chat/completions', 'num_concurrent': 32, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 3600}",`
22	`"num_seeds_merged": "3"`
23	`}`
24	`},`
25	`"evaluation_results": [`
26	`{`
27	`"evaluation_name": "mmlu_pro_chat/custom-extract",`
28	`"source_data": {`
29	`"dataset_name": "mmlu_pro_chat",`
30	`"source_type": "other"`
31	`},`
32	`"metric_config": {`
33	`"evaluation_description": "exact_match (filter: custom-extract)",`
34	`"lower_is_better": false,`
35	`"score_type": "continuous",`
36	`"min_score": 0.0,`
37	`"max_score": 1.0`
38	`},`
39	`"score_details": {`
40	`"score": 0.8544160017730497,`
41	`"details": {`
42	`"seed_scores": "[0.8532247340425532, 0.85546875, 0.8545545212765957]",`
43	`"evaluation_timestamps": "[1781533280, 1781536024, 1781538914]"`
44	`},`
45	`"uncertainty": {`
46	`"standard_error": {`
47	`"value": 0.0006514836001942016,`
48	`"method": "across_seeds"`
49	`},`
50	`"num_samples": 3`
51	`}`
52	`}`
53	`}`
54	`]`
55	`}`