every_eval_ever/mmlu_pro.json
| 1 | { |
| 2 | "schema_version": "0.2.2", |
| 3 | "evaluation_id": "mmlu_pro_chat/RedHatAI/gemma-4-31B-it-FP8-block/1781646636.373579", |
| 4 | "evaluation_timestamp": "1781533280", |
| 5 | "retrieved_timestamp": "1781646636.373579", |
| 6 | "source_metadata": { |
| 7 | "source_name": "lm-evaluation-harness", |
| 8 | "source_type": "evaluation_run", |
| 9 | "source_organization_name": "RedHatAI", |
| 10 | "evaluator_relationship": "third_party" |
| 11 | }, |
| 12 | "eval_library": { |
| 13 | "name": "lm_eval", |
| 14 | "version": "0.4.13.dev0" |
| 15 | }, |
| 16 | "model_info": { |
| 17 | "name": "RedHatAI/gemma-4-31B-it-FP8-block", |
| 18 | "id": "RedHatAI/gemma-4-31B-it-FP8-block", |
| 19 | "developer": "RedHatAI", |
| 20 | "additional_details": { |
| 21 | "model_args": "{'model': 'RedHatAI/gemma-4-31B-it-FP8-block', 'max_length': 69632, 'base_url': 'http://127.0.0.1:8000/v1/chat/completions', 'num_concurrent': 32, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 3600}", |
| 22 | "num_seeds_merged": "3" |
| 23 | } |
| 24 | }, |
| 25 | "evaluation_results": [ |
| 26 | { |
| 27 | "evaluation_name": "mmlu_pro_chat/custom-extract", |
| 28 | "source_data": { |
| 29 | "dataset_name": "mmlu_pro_chat", |
| 30 | "source_type": "other" |
| 31 | }, |
| 32 | "metric_config": { |
| 33 | "evaluation_description": "exact_match (filter: custom-extract)", |
| 34 | "lower_is_better": false, |
| 35 | "score_type": "continuous", |
| 36 | "min_score": 0.0, |
| 37 | "max_score": 1.0 |
| 38 | }, |
| 39 | "score_details": { |
| 40 | "score": 0.8544160017730497, |
| 41 | "details": { |
| 42 | "seed_scores": "[0.8532247340425532, 0.85546875, 0.8545545212765957]", |
| 43 | "evaluation_timestamps": "[1781533280, 1781536024, 1781538914]" |
| 44 | }, |
| 45 | "uncertainty": { |
| 46 | "standard_error": { |
| 47 | "value": 0.0006514836001942016, |
| 48 | "method": "across_seeds" |
| 49 | }, |
| 50 | "num_samples": 3 |
| 51 | } |
| 52 | } |
| 53 | } |
| 54 | ] |
| 55 | } |